Merge branch 'drm-next-5.1' of git://people.freedesktop.org/~agd5f/linux into drm-next

New stuff for 5.1. amdgpu: - DC bandwidth formula updates - Support for DCC on scanout surfaces - Support for multiple IH rings on soc15 asics - Fix xgmi locking - Add sysfs interface to get pcie usage stats - Simplify DC i2c/aux code - Initial support for BACO on vega10/20 - New runtime SMU feature debug interface - Expand existing sysfs power interfaces to new clock domains - Handle kexec properly - Simplify IH programming - Rework doorbell handling across asics - Drop old CI DPM implementation - DC page flipping fixes - Misc SR-IOV fixes amdkfd: - Simplify the interfaces between amdkfd and amdgpu ttm: - Add a callback to notify the driver when the lru changes sched: - Refactor mirror list handling - Rework hw fence processing Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexdeucher@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190125231517.26268-1-alexander.deucher@amd.com
author: Dave Airlie <airlied@redhat.com> 2019-01-31 14:19:45 +1000
committer: Dave Airlie <airlied@redhat.com> 2019-02-01 09:34:20 +1000
commit: e09191d360ab54c7ac7a9b09287dcab81c30d8e3 (patch)
tree: be0c0a0ed2f0603ecaa430bd09811484adc04ee6 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent: f91168f48556486743392b8838e20afbd84b7b7a (diff)
parent: 10117450735c7a7c0858095fb46a860e7037cb9a (diff)
1 files changed, 43 insertions, 34 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 28bccceaa363..c898b19f335a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1645,7 +1645,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		if (r) {
 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
 				  adev->ip_blocks[i].version->funcs->name, r);
-			return r;
+			goto init_failed;
 		}
 		adev->ip_blocks[i].status.sw = true;
 
@@ -1654,17 +1654,17 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 			r = amdgpu_device_vram_scratch_init(adev);
 			if (r) {
 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
-				return r;
+				goto init_failed;
 			}
 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
 			if (r) {
 				DRM_ERROR("hw_init %d failed %d\n", i, r);
-				return r;
+				goto init_failed;
 			}
 			r = amdgpu_device_wb_init(adev);
 			if (r) {
 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
-				return r;
+				goto init_failed;
 			}
 			adev->ip_blocks[i].status.hw = true;
 
@@ -1675,7 +1675,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 								AMDGPU_CSA_SIZE);
 				if (r) {
 					DRM_ERROR("allocate CSA failed %d\n", r);
-					return r;
+					goto init_failed;
 				}
 			}
 		}
@@ -1683,28 +1683,32 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
 	if (r)
-		return r;
+		goto init_failed;
 
 	r = amdgpu_device_ip_hw_init_phase1(adev);
 	if (r)
-		return r;
+		goto init_failed;
 
 	r = amdgpu_device_fw_loading(adev);
 	if (r)
-		return r;
+		goto init_failed;
 
 	r = amdgpu_device_ip_hw_init_phase2(adev);
 	if (r)
-		return r;
+		goto init_failed;
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_add_device(adev);
 	amdgpu_amdkfd_device_init(adev);
 
-	if (amdgpu_sriov_vf(adev))
+init_failed:
+	if (amdgpu_sriov_vf(adev)) {
+		if (!r)
+			amdgpu_virt_init_data_exchange(adev);
 		amdgpu_virt_release_full_gpu(adev, true);
+	}
 
-	return 0;
+	return r;
 }
 
 /**
@@ -2131,7 +2135,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
 				continue;
 
 			r = block->version->funcs->hw_init(adev);
-			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
+			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
 			if (r)
 				return r;
 		}
@@ -2165,7 +2169,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
 				continue;
 
 			r = block->version->funcs->hw_init(adev);
-			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
+			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
 			if (r)
 				return r;
 		}
@@ -2546,6 +2550,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* detect if we are with an SRIOV vbios */
 	amdgpu_device_detect_sriov_bios(adev);
 
+	/* check if we need to reset the asic
+	 *  E.g., driver was not cleanly unloaded previously, etc.
+	 */
+	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
+		r = amdgpu_asic_reset(adev);
+		if (r) {
+			dev_err(adev->dev, "asic reset on init failed\n");
+			goto failed;
+		}
+	}
+
 	/* Post card if necessary */
 	if (amdgpu_device_need_post(adev)) {
 		if (!adev->bios) {
@@ -2610,6 +2625,8 @@ fence_driver_init:
 		}
 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
+		if (amdgpu_virt_request_full_gpu(adev, false))
+			amdgpu_virt_release_full_gpu(adev, false);
 		goto failed;
 	}
 
@@ -2632,9 +2649,6 @@ fence_driver_init:
 		goto failed;
 	}
 
-	if (amdgpu_sriov_vf(adev))
-		amdgpu_virt_init_data_exchange(adev);
-
 	amdgpu_fbdev_init(adev);
 
 	r = amdgpu_pm_sysfs_init(adev);
@@ -2798,7 +2812,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
 			struct drm_framebuffer *fb = crtc->primary->fb;
 			struct amdgpu_bo *robj;
 
-			if (amdgpu_crtc->cursor_bo) {
+			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
 				r = amdgpu_bo_reserve(aobj, true);
 				if (r == 0) {
@@ -2906,7 +2920,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
 
-			if (amdgpu_crtc->cursor_bo) {
+			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
 				r = amdgpu_bo_reserve(aobj, true);
 				if (r == 0) {
@@ -3226,6 +3240,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	r = amdgpu_ib_ring_tests(adev);
 
 error:
+	amdgpu_virt_init_data_exchange(adev);
 	amdgpu_virt_release_full_gpu(adev, true);
 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
 		atomic_inc(&adev->vram_lost_counter);
@@ -3298,17 +3313,15 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		kthread_park(ring->sched.thread);
-
-		if (job && job->base.sched != &ring->sched)
-			continue;
-
-		drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
+		drm_sched_stop(&ring->sched);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
 	}
 
+	if(job)
+		drm_sched_increase_karma(&job->base);
+
 
 
 	if (!amdgpu_sriov_vf(adev)) {
@@ -3454,14 +3467,10 @@ static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		/* only need recovery sched of the given job's ring
-		 * or all rings (in the case @job is NULL)
-		 * after above amdgpu_reset accomplished
-		 */
-		if ((!job || job->base.sched == &ring->sched) && !adev->asic_reset_res)
-			drm_sched_job_recovery(&ring->sched);
+		if (!adev->asic_reset_res)
+			drm_sched_resubmit_jobs(&ring->sched);
 
-		kthread_unpark(ring->sched.thread);
+		drm_sched_start(&ring->sched, !adev->asic_reset_res);
 	}
 
 	if (!amdgpu_device_has_dc_support(adev)) {
@@ -3521,9 +3530,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * by different nodes. No point also since the one node already executing
 	 * reset will also reset all the other nodes in the hive.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev);
+	hive = amdgpu_get_xgmi_hive(adev, 0);
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
-	    !mutex_trylock(&hive->hive_lock))
+	    !mutex_trylock(&hive->reset_lock))
 		return 0;
 
 	/* Start with adev pre asic reset first for soft reset check.*/
@@ -3602,7 +3611,7 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	}
 
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
-		mutex_unlock(&hive->hive_lock);
+		mutex_unlock(&hive->reset_lock);
 
 	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
author	Dave Airlie <airlied@redhat.com>	2019-01-31 14:19:45 +1000
committer	Dave Airlie <airlied@redhat.com>	2019-02-01 09:34:20 +1000
commit	e09191d360ab54c7ac7a9b09287dcab81c30d8e3 (patch)
tree	be0c0a0ed2f0603ecaa430bd09811484adc04ee6 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent	f91168f48556486743392b8838e20afbd84b7b7a (diff)
parent	10117450735c7a7c0858095fb46a860e7037cb9a (diff)