| author | Maarten Lankhorst <maarten.lankhorst@linux.intel.com> | 2024-11-04 14:44:16 +0100 | 
|---|---|---|
| committer | Maarten Lankhorst <maarten.lankhorst@linux.intel.com> | 2024-11-04 14:45:21 +0100 | 
| commit | d78f0ee0406803cda8801fd5201746ccf89e5e4a (patch) | |
| tree | d0d145319344a07cb6f4b47578f01236b71ad473 /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | |
| parent | a88f9ed63b3cec761b04cba8104b2d0b2b66b25d (diff) | |
| parent | 30169bb64580bd7bce9290c1952bf0aa6cc37fe5 (diff) | |
Merge remote-tracking branch 'drm/drm-next' into drm-misc-next
Didn't notice drm/drm-next had the build fix for drm_bridge, so ended up
committing the same patch. Sync with drm and pretend it didn't happen?
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 140 | 
1 files changed, 135 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 7de449fae1e3..b47422b0b5b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -667,6 +667,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 	task_barrier_init(&hive->tb);
 	hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
 	hive->hi_req_gpu = NULL;
+	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
 
 	/*
 	 * hive pstate on boot is high in vega20 so we have to go to low
@@ -800,6 +801,23 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
 	return	-EINVAL;
 }
 
+bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
+					struct amdgpu_device *peer_adev)
+{
+	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
+	int i;
+
+	/* Sharing should always be enabled for non-SRIOV. */
+	if (!amdgpu_sriov_vf(adev))
+		return true;
+
+	for (i = 0 ; i < top->num_nodes; ++i)
+		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
+			return !!top->nodes[i].is_sharing_enabled;
+
+	return false;
+}
+
 /*
  * Devices that support extended data require the entire hive to initialize with
  * the shared memory buffer flag set.
@@ -860,8 +878,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return 0;
 
-	if (!adev->gmc.xgmi.pending_reset &&
-	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		ret = psp_xgmi_initialize(&adev->psp, false, true);
 		if (ret) {
 			dev_err(adev->dev,
@@ -907,8 +924,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 
 	task_barrier_add_task(&hive->tb);
 
-	if (!adev->gmc.xgmi.pending_reset &&
-	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			/* update node list for other device in the hive */
 			if (tmp_adev != adev) {
@@ -985,7 +1001,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		}
 	}
 
-	if (!ret && !adev->gmc.xgmi.pending_reset)
+	if (!ret)
 		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
 exit_unlock:
@@ -1500,3 +1516,117 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
 
 	return 0;
 }
+
+static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
+{
+	struct amdgpu_hive_info *hive =
+		container_of(work, struct amdgpu_hive_info, reset_on_init_work);
+	struct amdgpu_reset_context reset_context;
+	struct amdgpu_device *tmp_adev;
+	struct list_head device_list;
+	int r;
+
+	mutex_lock(&hive->hive_lock);
+
+	INIT_LIST_HEAD(&device_list);
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+		list_add_tail(&tmp_adev->reset_list, &device_list);
+
+	tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
+				    reset_list);
+	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+
+	reset_context.method = AMD_RESET_METHOD_ON_INIT;
+	reset_context.reset_req_dev = tmp_adev;
+	reset_context.hive = hive;
+	reset_context.reset_device_list = &device_list;
+	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
+
+	amdgpu_reset_do_xgmi_reset_on_init(&reset_context);
+	mutex_unlock(&hive->hive_lock);
+	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		r = amdgpu_ras_init_badpage_info(tmp_adev);
+		if (r && r != -EHWPOISON)
+			dev_err(tmp_adev->dev,
+				"error during bad page data initialization");
+	}
+}
+
+static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
+{
+	INIT_WORK(&hive->reset_on_init_work, amdgpu_xgmi_reset_on_init_work);
+	amdgpu_reset_domain_schedule(hive->reset_domain,
+				     &hive->reset_on_init_work);
+}
+
+int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
+{
+	struct amdgpu_hive_info *hive;
+	bool reset_scheduled;
+	int num_devs;
+
+	hive = amdgpu_get_xgmi_hive(adev);
+	if (!hive)
+		return -EINVAL;
+
+	mutex_lock(&hive->hive_lock);
+	num_devs = atomic_read(&hive->number_devices);
+	reset_scheduled = false;
+	if (num_devs == adev->gmc.xgmi.num_physical_nodes) {
+		amdgpu_xgmi_schedule_reset_on_init(hive);
+		reset_scheduled = true;
+	}
+
+	mutex_unlock(&hive->hive_lock);
+	amdgpu_put_xgmi_hive(hive);
+
+	if (reset_scheduled)
+		flush_work(&hive->reset_on_init_work);
+
+	return 0;
+}
+
+int amdgpu_xgmi_request_nps_change(struct amdgpu_device *adev,
+				   struct amdgpu_hive_info *hive,
+				   int req_nps_mode)
+{
+	struct amdgpu_device *tmp_adev;
+	int cur_nps_mode, r;
+
+	/* This is expected to be called only during unload of driver. The
+	 * request needs to be placed only once for all devices in the hive. If
+	 * one of them fail, revert the request for previous successful devices.
+	 * After placing the request, make hive mode as UNKNOWN so that other
+	 * devices don't request anymore.
+	 */
+	mutex_lock(&hive->hive_lock);
+	if (atomic_read(&hive->requested_nps_mode) ==
+	    UNKNOWN_MEMORY_PARTITION_MODE) {
+		dev_dbg(adev->dev, "Unexpected entry for hive NPS change");
+		mutex_unlock(&hive->hive_lock);
+		return 0;
+	}
+	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+		r = adev->gmc.gmc_funcs->request_mem_partition_mode(
+			tmp_adev, req_nps_mode);
+		if (r)
+			break;
+	}
+	if (r) {
+		/* Request back current mode if one of the requests failed */
+		cur_nps_mode =
+			adev->gmc.gmc_funcs->query_mem_partition_mode(tmp_adev);
+		list_for_each_entry_continue_reverse(
+			tmp_adev, &hive->device_list, gmc.xgmi.head)
+			adev->gmc.gmc_funcs->request_mem_partition_mode(
+				tmp_adev, cur_nps_mode);
+	}
+	/* Set to UNKNOWN so that other devices don't request anymore */
+	atomic_set(&hive->requested_nps_mode, UNKNOWN_MEMORY_PARTITION_MODE);
+	mutex_unlock(&hive->hive_lock);
+
+	return r;
+}
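The new amdgpu_xgmi_request_nps_change() above walks every device in the hive placing the NPS request and, if any per-device request fails, walks back over the devices that already succeeded and re-requests the current mode. The following is a minimal, standalone C sketch of that forward-apply / reverse-revert pattern, not driver code: struct node and apply_mode() are hypothetical stand-ins for amdgpu_device and the gmc_funcs->request_mem_partition_mode callback.

```c
#include <stdio.h>

#define NUM_NODES 4

struct node {
	int id;
	int mode;	/* mode currently applied to this node */
};

/* Hypothetical stand-in for the per-device request; fails on node 2 so the
 * rollback path below actually runs. */
static int apply_mode(struct node *n, int mode)
{
	if (n->id == 2)
		return -1;
	n->mode = mode;
	return 0;
}

int main(void)
{
	struct node nodes[NUM_NODES] = { {0, 1}, {1, 1}, {2, 1}, {3, 1} };
	int req_mode = 4;	/* requested partition mode */
	int cur_mode = 1;	/* mode currently in effect */
	int i, r = 0;

	/* Forward pass: place the request on every node, stop at the first
	 * failure (the list_for_each_entry() loop in the patch). */
	for (i = 0; i < NUM_NODES; i++) {
		r = apply_mode(&nodes[i], req_mode);
		if (r)
			break;
	}

	/* Reverse pass on failure: restore cur_mode on the nodes that already
	 * accepted the request (list_for_each_entry_continue_reverse() in the
	 * patch). */
	if (r)
		for (i = i - 1; i >= 0; i--)
			apply_mode(&nodes[i], cur_mode);

	for (i = 0; i < NUM_NODES; i++)
		printf("node %d: mode %d\n", nodes[i].id, nodes[i].mode);

	return r ? 1 : 0;
}
```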
