author      Dave Airlie <airlied@redhat.com>    2023-10-18 16:08:07 +1000
committer   Dave Airlie <airlied@redhat.com>    2023-10-18 16:08:07 +1000
commit      27442758e9b4e083bef3f164a1739475c01f3202 (patch)
tree        58f4a82f1b9e41c11b6247d8f7b9c3c9b8b52711 /drivers/gpu/drm/amd/amdkfd
parent      08057253366d916a73e62bafb913d9b659228cc1 (diff)
parent      cd90511557fdfb394bb4ac4c3b539b007383914c (diff)
Merge tag 'amd-drm-next-6.7-2023-10-13' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.7-2023-10-13:
amdgpu:
- DC replay fixes
- Misc code cleanups and spelling fixes
- Documentation updates
- RAS EEPROM Updates
- FRU EEPROM Updates
- IP discovery updates
- SR-IOV fixes
- RAS updates
- DC PQ fixes
- SMU 13.0.6 updates
- GC 11.5 Support
- NBIO 7.11 Support
- GMC 11 Updates
- Reset fixes
- SMU 11.5 Updates
- SMU 13.0 OD support
- Use flexible arrays for bo list handling
- W=1 Fixes
- SubVP fixes
- DPIA fixes
- DCN 3.5 Support
- Devcoredump fixes
- VPE 6.1 support
- VCN 4.0 Updates
- S/G display fixes
- DML fixes
- DML2 Support
- MST fixes
- VRR fixes
- Enable seamless boot in more cases
- Enable content type property for HDMI
- OLED fixes
- Rework and clean up GPUVM TLB flushing
- DC ODM fixes
- DP 2.x fixes
- AGP aperture fixes
- SDMA firmware loading cleanups
- Cyan Skillfish GPU clock counter fix
- GC 11 GART fix
- Cache GPU fault info for userspace queries
- DC cursor check fixes
- eDP fixes
- DC FP handling fixes
- Variable sized array fixes
- SMU 13.0.x fixes
- IB start and size alignment fixes for VCN
- SMU 14 Support
- Suspend and resume sequence rework
- vkms fix
amdkfd:
- GC 11 fixes
- GC 10 fixes
- Doorbell fixes
- CWSR fixes
- SVM fixes
- Clean up GC info enumeration
- Rework memory limit handling
- Coherent memory handling fixes
- Use partial migrations in GPU faults (see the alignment sketch after this list)
- TLB flush fixes
- DMA unmap fixes
- GC 9.4.3 fixes
- SQ interrupt fix
- GTT mapping fix
- GC 11.5 Support
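
The partial-migration change above narrows CPU-page-fault migrations to a granularity-aligned window instead of the whole range. A minimal standalone sketch of that window selection, mirroring the svm_migrate_to_ram() logic in the kfd_migrate.c diff below (macro names here are local reimplementations for illustration):

	/* Sketch: clamp the faulting address to a granularity-aligned
	 * migration block, as the new svm_migrate_to_ram() does.
	 */
	#include <stdio.h>

	#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
	#define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))
	#define MAX(a, b)        ((a) > (b) ? (a) : (b))
	#define MIN(a, b)        ((a) < (b) ? (a) : (b))

	int main(void)
	{
		unsigned long addr = 0x12345;            /* faulting page number */
		unsigned long range_start = 0x12000;     /* prange->start */
		unsigned long range_last = 0x12fff;      /* prange->last */
		unsigned int granularity = 9;            /* 2^9 pages per block */

		unsigned long size = 1UL << granularity;
		unsigned long start = MAX(ALIGN_DOWN(addr, size), range_start);
		unsigned long last = MIN(ALIGN(addr + 1, size) - 1, range_last);

		/* only [start, last] is migrated, not the whole prange */
		printf("migrate [0x%lx 0x%lx]\n", start, last);
		return 0;
	}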
radeon:
- Misc code cleanups
- W=1 Fixes
- Fix possible buffer overflow
- Fix possible NULL pointer dereference
UAPI:
- Add EXT_COHERENT memory allocation flags. These allow for system scope atomics (usage sketch below).
Proposed userspace: https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/pull/88
- Add support for the new VPE engine. This is a memory-to-memory copy engine with advanced scaling, CSC, and color management features (detection sketch below).
Proposed Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25713
- Add INFO IOCTL interface to query GPU faults (query sketch below).
Proposed Mesa MR: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23238
Proposed libdrm MR: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/298
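
A minimal sketch of setting the new EXT_COHERENT flag on an SVM range through the raw /dev/kfd ioctl. KFD_IOCTL_SVM_FLAG_EXT_COHERENT is the flag added in the kfd_svm.c diff below; the surrounding ioctl plumbing (AMDKFD_IOC_SVM, struct kfd_ioctl_svm_args) is the existing SVM attribute interface from include/uapi/linux/kfd_ioctl.h. Real applications would go through the ROCr/Thunk wrappers from the linked PR instead:

	#include <stdint.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	/* Request extended (system-scope) coherence, enabling system
	 * scope atomics, on [addr, addr + size).
	 */
	static int set_ext_coherent(int kfd_fd, void *addr, size_t size)
	{
		struct kfd_ioctl_svm_args *args;
		int r;

		/* variable-length args: header plus one attribute */
		args = calloc(1, sizeof(*args) +
			      sizeof(struct kfd_ioctl_svm_attribute));
		if (!args)
			return -1;

		args->start_addr = (uint64_t)(uintptr_t)addr;
		args->size = size;
		args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
		args->nattr = 1;
		args->attrs[0].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
		args->attrs[0].val = KFD_IOCTL_SVM_FLAG_EXT_COHERENT;

		r = ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
		free(args);
		return r;
	}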
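
A hedged sketch of detecting the VPE engine from userspace with libdrm's existing amdgpu_query_hw_ip_info() wrapper. AMDGPU_HW_IP_VPE is the IP type introduced alongside this UAPI; verify it against the merged amdgpu_drm.h and the linked Mesa MR before relying on it:

	#include <fcntl.h>
	#include <stdio.h>
	#include <amdgpu.h>
	#include <amdgpu_drm.h>

	int main(void)
	{
		amdgpu_device_handle dev;
		struct drm_amdgpu_info_hw_ip ip = {0};
		uint32_t major, minor;
		int fd = open("/dev/dri/renderD128", O_RDWR); /* example node */

		if (fd < 0 || amdgpu_device_initialize(fd, &major, &minor, &dev))
			return 1;

		/* available_rings is non-zero when the kernel exposes VPE rings */
		if (!amdgpu_query_hw_ip_info(dev, AMDGPU_HW_IP_VPE, 0, &ip))
			printf("VPE %u.%u, rings 0x%x\n",
			       ip.hw_ip_version_major, ip.hw_ip_version_minor,
			       ip.available_rings);

		amdgpu_device_deinitialize(dev);
		return 0;
	}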
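
A sketch of the new GPU-fault query through libdrm's generic amdgpu_query_info() helper. The query id (AMDGPU_INFO_GPUVM_FAULT) and the struct name/fields follow the proposed UAPI in the linked libdrm MR and are assumptions here; check the merged headers before use:

	#include <stdio.h>
	#include <amdgpu.h>
	#include <amdgpu_drm.h>

	/* Print the last GPUVM fault cached by the kernel for this device. */
	static void dump_last_fault(amdgpu_device_handle dev)
	{
		struct drm_amdgpu_info_gpuvm_fault fault = {0};

		if (!amdgpu_query_info(dev, AMDGPU_INFO_GPUVM_FAULT,
				       sizeof(fault), &fault))
			printf("last GPUVM fault: addr 0x%llx status 0x%x\n",
			       (unsigned long long)fault.addr, fault.status);
	}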
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231013175758.1735031-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c               |  22
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_crat.c                  |  31
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_device.c                |  19
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 131
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c              |  10
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c       |   6
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c       |   6
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c        |   6
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_migrate.c               | 156
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_migrate.h               |   6
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c           |  28
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c     |   3
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                  |   4
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c |   3
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_svm.c                   | 217
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_svm.h                   |  12
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_topology.c              |  44
17 files changed, 418 insertions, 286 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index c37f1fcd2165..f6d4748c1980 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1021,7 +1021,7 @@ err_drm_file:
 
 bool kfd_dev_is_large_bar(struct kfd_node *dev)
 {
-	if (debug_largebar) {
+	if (dev->kfd->adev->debug_largebar) {
 		pr_debug("Simulate large-bar allocation on non large-bar machine\n");
 		return true;
 	}
@@ -1432,17 +1432,21 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 			goto sync_memory_failed;
 		}
 	}
-	mutex_unlock(&p->mutex);
 
-	if (flush_tlb) {
-		/* Flush TLBs after waiting for the page table updates to complete */
-		for (i = 0; i < args->n_devices; i++) {
-			peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
-			if (WARN_ON_ONCE(!peer_pdd))
-				continue;
+	/* Flush TLBs after waiting for the page table updates to complete */
+	for (i = 0; i < args->n_devices; i++) {
+		peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
+		if (WARN_ON_ONCE(!peer_pdd))
+			continue;
+		if (flush_tlb)
 			kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
-		}
+
+		/* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
+		amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
 	}
+
+	mutex_unlock(&p->mutex);
+
 	kfree(devices_arr);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index f76b7aee5c0a..0e792a8496d6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -1522,6 +1522,7 @@ int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pc
 	case IP_VERSION(11, 0, 2):
 	case IP_VERSION(11, 0, 3):
 	case IP_VERSION(11, 0, 4):
+	case IP_VERSION(11, 5, 0):
 		num_of_cache_types =
 			kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info);
 		break;
@@ -2037,11 +2038,12 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 				      uint32_t proximity_domain)
 {
 	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
+	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
+	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
 	struct crat_subtype_generic *sub_type_hdr;
 	struct kfd_local_mem_info local_mem_info;
 	struct kfd_topology_device *peer_dev;
 	struct crat_subtype_computeunit *cu;
-	struct kfd_cu_info cu_info;
 	int avail_size = *size;
 	uint32_t total_num_of_cu;
 	uint32_t nid = 0;
@@ -2085,21 +2087,20 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
 	cu->proximity_domain = proximity_domain;
 
-	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
-	cu->num_simd_per_cu = cu_info.simd_per_cu;
-	cu->num_simd_cores = cu_info.simd_per_cu *
-			(cu_info.cu_active_number / kdev->kfd->num_nodes);
-	cu->max_waves_simd = cu_info.max_waves_per_simd;
+	cu->num_simd_per_cu = cu_info->simd_per_cu;
+	cu->num_simd_cores = cu_info->simd_per_cu *
+			(cu_info->number / kdev->kfd->num_nodes);
+	cu->max_waves_simd = cu_info->max_waves_per_simd;
 
-	cu->wave_front_size = cu_info.wave_front_size;
-	cu->array_count = cu_info.num_shader_arrays_per_engine *
-		cu_info.num_shader_engines;
-	total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh);
+	cu->wave_front_size = cu_info->wave_front_size;
+	cu->array_count = gfx_info->max_sh_per_se *
+		gfx_info->max_shader_engines;
+	total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);
 	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
-	cu->num_cu_per_array = cu_info.num_cu_per_sh;
-	cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu;
-	cu->num_banks = cu_info.num_shader_engines;
-	cu->lds_size_in_kb = cu_info.lds_size;
+	cu->num_cu_per_array = gfx_info->max_cu_per_sh;
+	cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;
+	cu->num_banks = gfx_info->max_shader_engines;
+	cu->lds_size_in_kb = cu_info->lds_size;
 
 	cu->hsa_capability = 0;
 
@@ -2115,7 +2116,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
 			sub_type_hdr->length);
 
-	if (debug_largebar)
+	if (kdev->adev->debug_largebar)
 		local_mem_info.local_mem_size_private = 0;
 
 	if (local_mem_info.local_mem_size_private == 0)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 93ce181eb3ba..0a9cf9dfc224 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -65,7 +65,7 @@ static int kfd_resume(struct kfd_node *kfd);
 
 static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
 {
-	uint32_t sdma_version = kfd->adev->ip_versions[SDMA0_HWIP][0];
+	uint32_t sdma_version = amdgpu_ip_version(kfd->adev, SDMA0_HWIP, 0);
 
 	switch (sdma_version) {
 	case IP_VERSION(4, 0, 0):/* VEGA10 */
@@ -95,6 +95,7 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
 	case IP_VERSION(6, 0, 1):
 	case IP_VERSION(6, 0, 2):
 	case IP_VERSION(6, 0, 3):
+	case IP_VERSION(6, 1, 0):
 		kfd->device_info.num_sdma_queues_per_engine = 8;
 		break;
 	default:
@@ -111,6 +112,7 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
 	case IP_VERSION(6, 0, 1):
 	case IP_VERSION(6, 0, 2):
 	case IP_VERSION(6, 0, 3):
+	case IP_VERSION(6, 1, 0):
 		/* Reserve 1 for paging and 1 for gfx */
 		kfd->device_info.num_reserved_sdma_queues_per_engine = 2;
 		/* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */
@@ -162,6 +164,7 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
 	case IP_VERSION(11, 0, 2):
 	case IP_VERSION(11, 0, 3):
 	case IP_VERSION(11, 0, 4):
+	case IP_VERSION(11, 5, 0):
 		kfd->device_info.event_interrupt_class = &event_interrupt_class_v11;
 		break;
 	default:
@@ -279,7 +282,7 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
 			f2g = &gfx_v8_kfd2kgd;
 		break;
 	default:
-		switch (adev->ip_versions[GC_HWIP][0]) {
+		switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
 		/* Vega 10 */
 		case IP_VERSION(9, 0, 1):
 			gfx_target_version = 90000;
@@ -413,6 +416,10 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
 			gfx_target_version = 110001;
 			f2g = &gfx_v11_kfd2kgd;
 			break;
+		case IP_VERSION(11, 5, 0):
+			gfx_target_version = 110500;
+			f2g = &gfx_v11_kfd2kgd;
+			break;
 		default:
 			break;
 		}
@@ -420,9 +427,11 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
 	}
 
 	if (!f2g) {
-		if (adev->ip_versions[GC_HWIP][0])
-			dev_err(kfd_device, "GC IP %06x %s not supported in kfd\n",
-				adev->ip_versions[GC_HWIP][0], vf ? "VF" : "");
+		if (amdgpu_ip_version(adev, GC_HWIP, 0))
+			dev_err(kfd_device,
+				"GC IP %06x %s not supported in kfd\n",
+				amdgpu_ip_version(adev, GC_HWIP, 0),
+				vf ? "VF" : "");
 		else
 			dev_err(kfd_device, "%s %s not supported in kfd\n",
 				amdgpu_asic_name[adev->asic_type], vf ? "VF" : "");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 8a6cb41444a4..c0e71543389a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -92,7 +92,7 @@ static bool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe)
 unsigned int get_cp_queues_num(struct device_queue_manager *dqm)
 {
 	return bitmap_weight(dqm->dev->kfd->shared_resources.cp_queue_bitmap,
-				KGD_MAX_QUEUES);
+				AMDGPU_MAX_QUEUES);
 }
 
 unsigned int get_queues_per_pipe(struct device_queue_manager *dqm)
@@ -216,7 +216,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 
 	if (q->wptr_bo) {
 		wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1);
-		queue_input.wptr_mc_addr = ((uint64_t)q->wptr_bo->tbo.resource->start << PAGE_SHIFT) + wptr_addr_off;
+		queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->wptr_bo) + wptr_addr_off;
 	}
 
 	queue_input.is_kfd_process = 1;
@@ -227,13 +227,15 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	queue_input.tba_addr = qpd->tba_addr;
 	queue_input.tma_addr = qpd->tma_addr;
 	queue_input.trap_en = !kfd_dbg_has_cwsr_workaround(q->device);
-	queue_input.skip_process_ctx_clear = qpd->pqm->process->debug_trap_enabled ||
-					     kfd_dbg_has_ttmps_always_setup(q->device);
+	queue_input.skip_process_ctx_clear =
+		qpd->pqm->process->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED &&
+			(qpd->pqm->process->debug_trap_enabled ||
+			 kfd_dbg_has_ttmps_always_setup(q->device));
 
 	queue_type = convert_to_mes_queue_type(q->properties.type);
 	if (queue_type < 0) {
-		pr_err("Queue type not supported with MES, queue:%d\n",
-				q->properties.type);
+		dev_err(adev->dev, "Queue type not supported with MES, queue:%d\n",
+			q->properties.type);
 		return -EINVAL;
 	}
 	queue_input.queue_type = (uint32_t)queue_type;
@@ -244,9 +246,9 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
 	amdgpu_mes_unlock(&adev->mes);
 	if (r) {
-		pr_err("failed to add hardware queue to MES, doorbell=0x%x\n",
+		dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n",
 			q->properties.doorbell_off);
-		pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
+		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
 		kfd_hws_hang(dqm);
 	}
 
@@ -272,9 +274,9 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	amdgpu_mes_unlock(&adev->mes);
 
 	if (r) {
-		pr_err("failed to remove hardware queue from MES, doorbell=0x%x\n",
+		dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
 			q->properties.doorbell_off);
-		pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
+		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
 		kfd_hws_hang(dqm);
 	}
 
@@ -284,6 +286,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 static int remove_all_queues_mes(struct device_queue_manager *dqm)
 {
 	struct device_process_node *cur;
+	struct device *dev = dqm->dev->adev->dev;
 	struct qcm_process_device *qpd;
 	struct queue *q;
 	int retval = 0;
@@ -294,7 +297,7 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm)
 			if (q->properties.is_active) {
 				retval = remove_queue_mes(dqm, q, qpd);
 				if (retval) {
-					pr_err("%s: Failed to remove queue %d for dev %d",
+					dev_err(dev, "%s: Failed to remove queue %d for dev %d",
 						__func__,
 						q->properties.queue_id,
 						dqm->dev->id);
@@ -407,7 +410,8 @@ static int allocate_doorbell(struct qcm_process_device *qpd,
 	q->properties.doorbell_off =
 		amdgpu_doorbell_index_on_bar(dev->adev,
 					     qpd->proc_doorbells,
-					     q->doorbell_id);
+					     q->doorbell_id,
+					     dev->kfd->device_info.doorbell_size);
 	return 0;
 }
 
@@ -443,6 +447,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd,
 			struct queue *q)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int allocated_vmid = -1, i;
 
 	for (i = dqm->dev->vm_info.first_vmid_kfd;
@@ -454,7 +459,7 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	}
 
 	if (allocated_vmid < 0) {
-		pr_err("no more vmid to allocate\n");
+		dev_err(dev, "no more vmid to allocate\n");
 		return -ENOSPC;
 	}
 
@@ -510,10 +515,12 @@ static void deallocate_vmid(struct device_queue_manager *dqm,
 			struct qcm_process_device *qpd,
 			struct queue *q)
 {
+	struct device *dev = dqm->dev->adev->dev;
+
 	/* On GFX v7, CP doesn't flush TC at dequeue */
 	if (q->device->adev->asic_type == CHIP_HAWAII)
 		if (flush_texture_cache_nocpsch(q->device, qpd))
-			pr_err("Failed to flush TC\n");
+			dev_err(dev, "Failed to flush TC\n");
 
 	kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
 
@@ -708,7 +715,7 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process
 	pr_debug("Killing all process wavefronts\n");
 
 	if (!dev->kfd2kgd->get_atc_vmid_pasid_mapping_info) {
-		pr_err("no vmid pasid mapping supported \n");
+		dev_err(dev->adev->dev, "no vmid pasid mapping supported\n");
 		return -EOPNOTSUPP;
 	}
 
@@ -729,7 +736,7 @@ static int dbgdev_wave_reset_wavefronts(struct kfd_node *dev, struct kfd_process
 	}
 
 	if (vmid > last_vmid_to_scan) {
-		pr_err("Didn't find vmid for pasid 0x%x\n", p->pasid);
+		dev_err(dev->adev->dev, "Didn't find vmid for pasid 0x%x\n", p->pasid);
 		return -EFAULT;
 	}
 
@@ -821,6 +828,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 {
 	int retval;
 	uint64_t sdma_val = 0;
+	struct device *dev = dqm->dev->adev->dev;
 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
 	struct mqd_manager *mqd_mgr =
 		dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
@@ -831,7 +839,7 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 		retval = read_sdma_queue_counter((uint64_t __user *)q->properties.read_ptr,
 							&sdma_val);
 		if (retval)
-			pr_err("Failed to read SDMA queue counter for queue: %d\n",
+			dev_err(dev, "Failed to read SDMA queue counter for queue: %d\n",
 				q->properties.queue_id);
 	}
 
@@ -850,6 +858,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 			struct mqd_update_info *minfo)
 {
 	int retval = 0;
+	struct device *dev = dqm->dev->adev->dev;
 	struct mqd_manager *mqd_mgr;
 	struct kfd_process_device *pdd;
 	bool prev_active = false;
@@ -875,7 +884,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 			retval = remove_queue_mes(dqm, q, &pdd->qpd);
 			if (retval) {
-				pr_err("unmap queue failed\n");
+				dev_err(dev, "unmap queue failed\n");
 				goto out_unlock;
 			}
 		} else if (prev_active &&
@@ -894,7 +903,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
 				(KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN),
 				KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
 			if (retval) {
-				pr_err("destroy mqd failed\n");
+				dev_err(dev, "destroy mqd failed\n");
 				goto out_unlock;
 			}
 		}
@@ -1088,6 +1097,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 				      struct qcm_process_device *qpd)
 {
 	struct queue *q;
+	struct device *dev = dqm->dev->adev->dev;
 	struct kfd_process_device *pdd;
 	int retval = 0;
@@ -1121,7 +1131,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		if (dqm->dev->kfd->shared_resources.enable_mes) {
 			retval = remove_queue_mes(dqm, q, qpd);
 			if (retval) {
-				pr_err("Failed to evict queue %d\n",
+				dev_err(dev, "Failed to evict queue %d\n",
 					q->properties.queue_id);
 				goto out;
 			}
@@ -1225,6 +1235,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 				       struct qcm_process_device *qpd)
 {
 	struct queue *q;
+	struct device *dev = dqm->dev->adev->dev;
 	struct kfd_process_device *pdd;
 	uint64_t eviction_duration;
 	int retval = 0;
@@ -1265,7 +1276,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 		if (dqm->dev->kfd->shared_resources.enable_mes) {
 			retval = add_queue_mes(dqm, q, qpd);
 			if (retval) {
-				pr_err("Failed to restore queue %d\n",
+				dev_err(dev, "Failed to restore queue %d\n",
 					q->properties.queue_id);
 				goto out;
 			}
@@ -1474,18 +1485,19 @@ static void pre_reset(struct device_queue_manager *dqm)
 static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				struct queue *q, const uint32_t *restore_sdma_id)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int bit;
 
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
-			pr_err("No more SDMA queue to allocate\n");
+			dev_err(dev, "No more SDMA queue to allocate\n");
 			return -ENOMEM;
 		}
 
 		if (restore_sdma_id) {
 			/* Re-use existing sdma_id */
 			if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
-				pr_err("SDMA queue already in use\n");
+				dev_err(dev, "SDMA queue already in use\n");
 				return -EBUSY;
 			}
 			clear_bit(*restore_sdma_id, dqm->sdma_bitmap);
@@ -1504,13 +1516,13 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 				kfd_get_num_sdma_engines(dqm->dev);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
-			pr_err("No more XGMI SDMA queue to allocate\n");
+			dev_err(dev, "No more XGMI SDMA queue to allocate\n");
 			return -ENOMEM;
 		}
 		if (restore_sdma_id) {
 			/* Re-use existing sdma_id */
 			if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
-				pr_err("SDMA queue already in use\n");
+				dev_err(dev, "SDMA queue already in use\n");
 				return -EBUSY;
 			}
 			clear_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap);
@@ -1562,11 +1574,12 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 {
 	int i, mec;
 	struct scheduling_resources res;
+	struct device *dev = dqm->dev->adev->dev;
 
 	res.vmid_mask = dqm->dev->compute_vmid_bitmap;
 
 	res.queue_mask = 0;
-	for (i = 0; i < KGD_MAX_QUEUES; ++i) {
+	for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
 		mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
 			/ dqm->dev->kfd->shared_resources.num_pipe_per_mec;
 
@@ -1582,7 +1595,7 @@ static int set_sched_resources(struct device_queue_manager *dqm)
 		 * definition of res.queue_mask needs updating
 		 */
 		if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) {
-			pr_err("Invalid queue enabled by amdgpu: %d\n", i);
+			dev_err(dev, "Invalid queue enabled by amdgpu: %d\n", i);
 			break;
 		}
 
@@ -1625,6 +1638,7 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
 
 static int start_cpsch(struct device_queue_manager *dqm)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int retval;
 
 	retval = 0;
@@ -1671,7 +1685,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
 
 		retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
 		if (retval)
-			pr_err("Setting grace timeout failed\n");
+			dev_err(dev, "Setting grace timeout failed\n");
 		else if (dqm->dev->kfd2kgd->build_grace_period_packet_info)
 			/* Update dqm->wait_times maintained in software */
 			dqm->dev->kfd2kgd->build_grace_period_packet_info(
@@ -1880,15 +1894,17 @@ out:
 	return retval;
 }
 
-int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
-				uint64_t fence_value,
-				unsigned int timeout_ms)
+int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
+			      uint64_t fence_value,
+			      unsigned int timeout_ms)
 {
 	unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies;
+	struct device *dev = dqm->dev->adev->dev;
+	uint64_t *fence_addr = dqm->fence_addr;
 
 	while (*fence_addr != fence_value) {
 		if (time_after(jiffies, end_jiffies)) {
-			pr_err("qcm fence wait loop timeout expired\n");
+			dev_err(dev, "qcm fence wait loop timeout expired\n");
 			/* In HWS case, this is used to halt the driver thread
 			 * in order not to mess up CP states before doing
 			 * scandumps for FW debugging.
@@ -1907,6 +1923,7 @@ int amdkfd_fence_wait_timeout(uint64_t *fence_addr,
 /* dqm->lock mutex has to be locked before calling this function */
 static int map_queues_cpsch(struct device_queue_manager *dqm)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int retval;
 
 	if (!dqm->sched_running)
@@ -1919,7 +1936,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
 	retval = pm_send_runlist(&dqm->packet_mgr, &dqm->queues);
 	pr_debug("%s sent runlist\n", __func__);
 	if (retval) {
-		pr_err("failed to execute runlist\n");
+		dev_err(dev, "failed to execute runlist\n");
 		return retval;
 	}
 	dqm->active_runlist = true;
@@ -1934,8 +1951,9 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 				uint32_t grace_period,
 				bool reset)
 {
-	int retval = 0;
+	struct device *dev = dqm->dev->adev->dev;
 	struct mqd_manager *mqd_mgr;
+	int retval = 0;
 
 	if (!dqm->sched_running)
 		return 0;
@@ -1958,10 +1976,10 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
 				KFD_FENCE_COMPLETED);
 	/* should be timed out */
-	retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
-				queue_preemption_timeout_ms);
+	retval = amdkfd_fence_wait_timeout(dqm, KFD_FENCE_COMPLETED,
+					   queue_preemption_timeout_ms);
 	if (retval) {
-		pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+		dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
 		kfd_hws_hang(dqm);
 		return retval;
 	}
@@ -1976,7 +1994,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	 */
 	mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
 	if (mqd_mgr->read_doorbell_id(dqm->packet_mgr.priv_queue->queue->mqd)) {
-		pr_err("HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");
+		dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n");
 		while (halt_if_hws_hang)
 			schedule();
 		return -ETIME;
@@ -1986,7 +2004,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
 	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
 		if (pm_update_grace_period(&dqm->packet_mgr,
 					USE_DEFAULT_GRACE_PERIOD))
-			pr_err("Failed to reset grace period\n");
+			dev_err(dev, "Failed to reset grace period\n");
 	}
 
 	pm_release_ib(&dqm->packet_mgr);
@@ -2060,6 +2078,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 	struct mqd_manager *mqd_mgr;
 	uint64_t sdma_val = 0;
 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+	struct device *dev = dqm->dev->adev->dev;
 
 	/* Get the SDMA queue stats */
 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
@@ -2067,7 +2086,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 		retval = read_sdma_queue_counter((uint64_t __user *)q->properties.read_ptr,
 							&sdma_val);
 		if (retval)
-			pr_err("Failed to read SDMA queue counter for queue: %d\n",
+			dev_err(dev, "Failed to read SDMA queue counter for queue: %d\n",
 				q->properties.queue_id);
 	}
 
@@ -2348,6 +2367,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 {
 	int retval;
 	struct queue *q;
+	struct device *dev = dqm->dev->adev->dev;
 	struct kernel_queue *kq, *kq_next;
 	struct mqd_manager *mqd_mgr;
 	struct device_process_node *cur, *next_dpn;
@@ -2381,7 +2401,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		if (dqm->dev->kfd->shared_resources.enable_mes) {
 			retval = remove_queue_mes(dqm, q, qpd);
 			if (retval)
-				pr_err("Failed to remove queue %d\n",
+				dev_err(dev, "Failed to remove queue %d\n",
 					q->properties.queue_id);
 		}
 	}
@@ -2436,12 +2456,13 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 static int init_mqd_managers(struct device_queue_manager *dqm)
 {
 	int i, j;
+	struct device *dev = dqm->dev->adev->dev;
 	struct mqd_manager *mqd_mgr;
 
 	for (i = 0; i < KFD_MQD_TYPE_MAX; i++) {
 		mqd_mgr = dqm->asic_ops.mqd_manager_init(i, dqm->dev);
 		if (!mqd_mgr) {
-			pr_err("mqd manager [%d] initialization failed\n", i);
+			dev_err(dev, "mqd manager [%d] initialization failed\n", i);
 			goto out_free;
 		}
 		dqm->mqd_mgrs[i] = mqd_mgr;
@@ -2551,7 +2572,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
 		dqm->ops.checkpoint_mqd = checkpoint_mqd;
 		break;
 	default:
-		pr_err("Invalid scheduling policy %d\n", dqm->sched_policy);
+		dev_err(dev->adev->dev, "Invalid scheduling policy %d\n", dqm->sched_policy);
 		goto out_free;
 	}
 
@@ -2589,7 +2610,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
 		goto out_free;
 
 	if (!dev->kfd->shared_resources.enable_mes && allocate_hiq_sdma_mqd(dqm)) {
-		pr_err("Failed to allocate hiq sdma mqd trunk buffer\n");
+		dev_err(dev->adev->dev, "Failed to allocate hiq sdma mqd trunk buffer\n");
 		goto out_free;
 	}
 
@@ -2648,17 +2669,18 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
 			    struct qcm_process_device *qpd)
 {
 	int r;
+	struct device *dev = dqm->dev->adev->dev;
 	int updated_vmid_mask;
 
 	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
-		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
 		return -EINVAL;
 	}
 
 	dqm_lock(dqm);
 
 	if (dqm->trap_debug_vmid != 0) {
-		pr_err("Trap debug id already reserved\n");
+		dev_err(dev, "Trap debug id already reserved\n");
 		r = -EBUSY;
 		goto out_unlock;
 	}
@@ -2694,19 +2716,20 @@ out_unlock:
 int release_debug_trap_vmid(struct device_queue_manager *dqm,
 			    struct qcm_process_device *qpd)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int r;
 	int updated_vmid_mask;
 	uint32_t trap_debug_vmid;
 
 	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
-		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
 		return -EINVAL;
 	}
 
 	dqm_lock(dqm);
 	trap_debug_vmid = dqm->trap_debug_vmid;
 	if (dqm->trap_debug_vmid == 0) {
-		pr_err("Trap debug id is not reserved\n");
+		dev_err(dev, "Trap debug id is not reserved\n");
 		r = -EINVAL;
 		goto out_unlock;
 	}
@@ -2843,6 +2866,7 @@ int resume_queues(struct kfd_process *p,
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_process_device *pdd = p->pdds[i];
 		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct device *dev = dqm->dev->adev->dev;
 		struct qcm_process_device *qpd = &pdd->qpd;
 		struct queue *q;
 		int r, per_device_resumed = 0;
@@ -2893,7 +2917,7 @@ int resume_queues(struct kfd_process *p,
 					0,
 					USE_DEFAULT_GRACE_PERIOD);
 		if (r) {
-			pr_err("Failed to resume process queues\n");
+			dev_err(dev, "Failed to resume process queues\n");
 			if (queue_ids) {
 				list_for_each_entry(q, &qpd->queues_list, list) {
 					int q_idx = q_array_get_index(
@@ -2945,6 +2969,7 @@ int suspend_queues(struct kfd_process *p,
 	for (i = 0; i < p->n_pdds; i++) {
 		struct kfd_process_device *pdd = p->pdds[i];
 		struct device_queue_manager *dqm = pdd->dev->dqm;
+		struct device *dev = dqm->dev->adev->dev;
 		struct qcm_process_device *qpd = &pdd->qpd;
 		struct queue *q;
 		int r, per_device_suspended = 0;
@@ -2993,7 +3018,7 @@ int suspend_queues(struct kfd_process *p,
 					grace_period);
 
 		if (r)
-			pr_err("Failed to suspend process queues.\n");
+			dev_err(dev, "Failed to suspend process queues.\n");
 		else
 			total_suspended += per_device_suspended;
@@ -3080,10 +3105,11 @@ void set_queue_snapshot_entry(struct queue *q,
 
 int debug_lock_and_unmap(struct device_queue_manager *dqm)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int r;
 
 	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
-		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
 		return -EINVAL;
 	}
@@ -3101,10 +3127,11 @@ int debug_lock_and_unmap(struct device_queue_manager *dqm)
 
 int debug_map_and_unlock(struct device_queue_manager *dqm)
 {
+	struct device *dev = dqm->dev->adev->dev;
 	int r;
 
 	if (dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
-		pr_err("Unsupported on sched_policy: %i\n", dqm->sched_policy);
+		dev_err(dev, "Unsupported on sched_policy: %i\n", dqm->sched_policy);
 		return -EINVAL;
 	}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index 7b38537c7c99..05c74887fd6f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -161,7 +161,10 @@ void __iomem *kfd_get_kernel_doorbell(struct kfd_dev *kfd,
 	if (inx >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS)
 		return NULL;
 
-	*doorbell_off = amdgpu_doorbell_index_on_bar(kfd->adev, kfd->doorbells, inx);
+	*doorbell_off = amdgpu_doorbell_index_on_bar(kfd->adev,
+						     kfd->doorbells,
+						     inx,
+						     kfd->device_info.doorbell_size);
 	inx *= 2;
 
 	pr_debug("Get kernel queue doorbell\n"
@@ -240,7 +243,10 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd)
 		return 0;
 	}
 
-	first_db_index = amdgpu_doorbell_index_on_bar(adev, pdd->qpd.proc_doorbells, 0);
+	first_db_index = amdgpu_doorbell_index_on_bar(adev,
+						      pdd->qpd.proc_doorbells,
+						      0,
+						      pdd->dev->kfd->device_info.doorbell_size);
 	return adev->doorbell.base + first_db_index * sizeof(uint32_t);
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index c7991e07b6be..a7697ec8188e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -268,7 +268,7 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
 				SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
 			switch (encoding) {
 			case SQ_INTERRUPT_WORD_ENCODING_AUTO:
-				pr_debug(
+				pr_debug_ratelimited(
 					"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf0_full %d, ttrac_buf1_full %d, ttrace_utc_err %d\n",
 					REG_GET_FIELD(context_id1,
 						SQ_INTERRUPT_WORD_AUTO_CTXID1,
 						SE_ID),
@@ -284,7 +284,7 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
 						THREAD_TRACE_UTC_ERROR));
 				break;
 			case SQ_INTERRUPT_WORD_ENCODING_INST:
-				pr_debug("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
+				pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
 					REG_GET_FIELD(context_id1,
 						SQ_INTERRUPT_WORD_WAVE_CTXID1,
 						SE_ID),
 					REG_GET_FIELD(context_id0,
 						SQ_INTERRUPT_WORD_WAVE_CTXID0,
@@ -310,7 +310,7 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
 			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
 				sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0,
 								ERR_TYPE);
-				pr_warn("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n",
+				pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n",
 					REG_GET_FIELD(context_id1,
 						SQ_INTERRUPT_WORD_WAVE_CTXID1,
 						SE_ID),
 					REG_GET_FIELD(context_id0,
 						SQ_INTERRUPT_WORD_WAVE_CTXID0,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index f933bd231fb9..2a65792fd116 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -150,7 +150,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
 
 static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1)
 {
-	pr_debug(
+	pr_debug_ratelimited(
 		"sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
 		REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE),
 		REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT),
@@ -165,7 +165,7 @@ static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1)
 
 static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1)
 {
-	pr_debug(
+	pr_debug_ratelimited(
 		"sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
 		REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA),
 		REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID),
@@ -177,7 +177,7 @@ static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1)
 
 static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
 {
-	pr_warn(
+	pr_warn_ratelimited(
 		"sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
 		REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL),
 		REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 830396b1c3b1..27cdaea40501 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -333,7 +333,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
 		encoding = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, ENCODING);
 		switch (encoding) {
 		case SQ_INTERRUPT_WORD_ENCODING_AUTO:
-			pr_debug(
+			pr_debug_ratelimited(
 				"sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, SE_ID),
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE),
@@ -347,7 +347,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID, THREAD_TRACE_UTC_ERROR));
 			break;
 		case SQ_INTERRUPT_WORD_ENCODING_INST:
-			pr_debug("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n",
+			pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, intr_data 0x%x\n",
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
@@ -366,7 +366,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
 			break;
 		case SQ_INTERRUPT_WORD_ENCODING_ERROR:
 			sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE);
-			pr_warn("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n",
+			pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sh %d, priv %d, wave_id %d, simd_id %d, cu_id %d, err_type %d\n",
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SE_ID),
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, DATA),
 				REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SH_ID),
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 7d82c7da223a..81d25a679427 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -442,10 +442,10 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
 		goto out_free;
 	}
 	if (cpages != npages)
-		pr_debug("partial migration, 0x%lx/0x%llx pages migrated\n",
+		pr_debug("partial migration, 0x%lx/0x%llx pages collected\n",
			 cpages, npages);
 	else
-		pr_debug("0x%lx pages migrated\n", cpages);
+		pr_debug("0x%lx pages collected\n", cpages);
 
 	r = svm_migrate_copy_to_vram(node, prange, &migrate, &mfence, scratch, ttm_res_offset);
 	migrate_vma_pages(&migrate);
@@ -460,7 +460,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
 				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
 				    0, node->id, trigger);
 
-	svm_range_dma_unmap(adev->dev, scratch, 0, npages);
+	svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
 out_free:
 	kvfree(buf);
@@ -479,6 +479,8 @@ out:
  * svm_migrate_ram_to_vram - migrate svm range from system to device
  * @prange: range structure
  * @best_loc: the device to migrate to
+ * @start_mgr: start page to migrate
+ * @last_mgr: last page to migrate
  * @mm: the process mm structure
  * @trigger: reason of migration
 *
@@ -489,6 +491,7 @@ out:
  */
 static int
 svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
+			unsigned long start_mgr, unsigned long last_mgr,
 			struct mm_struct *mm, uint32_t trigger)
 {
 	unsigned long addr, start, end;
@@ -498,23 +501,30 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
 	unsigned long cpages = 0;
 	long r = 0;
 
-	if (prange->actual_loc == best_loc) {
-		pr_debug("svms 0x%p [0x%lx 0x%lx] already on best_loc 0x%x\n",
-			 prange->svms, prange->start, prange->last, best_loc);
+	if (!best_loc) {
+		pr_debug("svms 0x%p [0x%lx 0x%lx] migrate to sys ram\n",
+			 prange->svms, start_mgr, last_mgr);
 		return 0;
 	}
 
+	if (start_mgr < prange->start || last_mgr > prange->last) {
+		pr_debug("range [0x%lx 0x%lx] out prange [0x%lx 0x%lx]\n",
+			 start_mgr, last_mgr, prange->start, prange->last);
+		return -EFAULT;
+	}
+
 	node = svm_range_get_node_by_id(prange, best_loc);
 	if (!node) {
 		pr_debug("failed to get kfd node by id 0x%x\n", best_loc);
 		return -ENODEV;
 	}
-	pr_debug("svms 0x%p [0x%lx 0x%lx] to gpu 0x%x\n", prange->svms,
-		 prange->start, prange->last, best_loc);
+	pr_debug("svms 0x%p [0x%lx 0x%lx] in [0x%lx 0x%lx] to gpu 0x%x\n",
+		 prange->svms, start_mgr, last_mgr, prange->start, prange->last,
+		 best_loc);
 
-	start = prange->start << PAGE_SHIFT;
-	end = (prange->last + 1) << PAGE_SHIFT;
+	start = start_mgr << PAGE_SHIFT;
+	end = (last_mgr + 1) << PAGE_SHIFT;
 
 	r = svm_range_vram_node_new(node, prange, true);
 	if (r) {
@@ -544,8 +554,11 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
 
 	if (cpages) {
 		prange->actual_loc = best_loc;
-		svm_range_free_dma_mappings(prange, true);
-	} else {
+		prange->vram_pages = prange->vram_pages + cpages;
+	} else if (!prange->actual_loc) {
+		/* if no page migrated and all pages from prange are at
+		 * sys ram drop svm_bo got from svm_range_vram_node_new
+		 */
 		svm_range_vram_node_free(prange);
 	}
 
@@ -663,9 +676,8 @@ out_oom:
  * Context: Process context, caller hold mmap read lock, prange->migrate_mutex
  *
  * Return:
- *   0 - success with all pages migrated
  *   negative values - indicate error
- *   positive values - partial migration, number of pages not migrated
+ *   positive values or zero - number of pages got migrated
  */
 static long
 svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
@@ -676,6 +688,7 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
 	uint64_t npages = (end - start) >> PAGE_SHIFT;
 	unsigned long upages = npages;
 	unsigned long cpages = 0;
+	unsigned long mpages = 0;
 	struct amdgpu_device *adev = node->adev;
 	struct kfd_process_device *pdd;
 	struct dma_fence *mfence = NULL;
@@ -725,10 +738,10 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
 		goto out_free;
 	}
 	if (cpages != npages)
-		pr_debug("partial migration, 0x%lx/0x%llx pages migrated\n",
+		pr_debug("partial migration, 0x%lx/0x%llx pages collected\n",
			 cpages, npages);
 	else
-		pr_debug("0x%lx pages migrated\n", cpages);
+		pr_debug("0x%lx pages collected\n", cpages);
 
 	r = svm_migrate_copy_to_ram(adev, prange, &migrate, &mfence,
 				    scratch, npages);
@@ -745,23 +758,27 @@ svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
 			    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
 			    node->id, 0, trigger);
 
-	svm_range_dma_unmap(adev->dev, scratch, 0, npages);
+	svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);
 
out_free:
 	kvfree(buf);
out:
 	if (!r && cpages) {
+		mpages = cpages - upages;
 		pdd = svm_range_get_pdd_by_node(prange, node);
 		if (pdd)
-			WRITE_ONCE(pdd->page_out, pdd->page_out + cpages);
+			WRITE_ONCE(pdd->page_out, pdd->page_out + mpages);
 	}
-	return r ? r : upages;
+
+	return r ? r : mpages;
 }
 
 /**
  * svm_migrate_vram_to_ram - migrate svm range from device to system
  * @prange: range structure
  * @mm: process mm, use current->mm if NULL
+ * @start_mgr: start page need be migrated to sys ram
+ * @last_mgr: last page need be migrated to sys ram
  * @trigger: reason of migration
  * @fault_page: is from vmf->page, svm_migrate_to_ram(), this is CPU page fault callback
  *
@@ -771,6 +788,7 @@ out:
  * 0 - OK, otherwise error code
  */
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
+			    unsigned long start_mgr, unsigned long last_mgr,
 			    uint32_t trigger, struct page *fault_page)
 {
 	struct kfd_node *node;
@@ -778,26 +796,33 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
 	unsigned long addr;
 	unsigned long start;
 	unsigned long end;
-	unsigned long upages = 0;
+	unsigned long mpages = 0;
 	long r = 0;
 
+	/* this pragne has no any vram page to migrate to sys ram */
 	if (!prange->actual_loc) {
 		pr_debug("[0x%lx 0x%lx] already migrated to ram\n",
			 prange->start, prange->last);
 		return 0;
 	}
 
+	if (start_mgr < prange->start || last_mgr > prange->last) {
+		pr_debug("range [0x%lx 0x%lx] out prange [0x%lx 0x%lx]\n",
+			 start_mgr, last_mgr, prange->start, prange->last);
+		return -EFAULT;
+	}
+
 	node = svm_range_get_node_by_id(prange, prange->actual_loc);
 	if (!node) {
 		pr_debug("failed to get kfd node by id 0x%x\n", prange->actual_loc);
 		return -ENODEV;
 	}
 	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu 0x%x to ram\n",
-		 prange->svms, prange, prange->start, prange->last,
+		 prange->svms, prange, start_mgr, last_mgr,
		 prange->actual_loc);
 
-	start = prange->start << PAGE_SHIFT;
-	end = (prange->last + 1) << PAGE_SHIFT;
+	start = start_mgr << PAGE_SHIFT;
+	end = (last_mgr + 1) << PAGE_SHIFT;
 
 	for (addr = start; addr < end;) {
 		unsigned long next;
@@ -816,14 +841,21 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
 			pr_debug("failed %ld to migrate prange %p\n", r, prange);
 			break;
 		} else {
-			upages += r;
+			mpages += r;
 		}
 		addr = next;
 	}
 
-	if (r >= 0 && !upages) {
-		svm_range_vram_node_free(prange);
-		prange->actual_loc = 0;
+	if (r >= 0) {
+		prange->vram_pages -= mpages;
+
+		/* prange does not have vram page set its actual_loc to system
+		 * and drop its svm_bo ref
+		 */
+		if (prange->vram_pages == 0 && prange->ttm_res) {
+			prange->actual_loc = 0;
+			svm_range_vram_node_free(prange);
+		}
 	}
 
 	return r < 0 ? r : 0;
@@ -833,17 +865,23 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
  * svm_migrate_vram_to_vram - migrate svm range from device to device
  * @prange: range structure
  * @best_loc: the device to migrate to
+ * @start: start page need be migrated to sys ram
+ * @last: last page need be migrated to sys ram
  * @mm: process mm, use current->mm if NULL
  * @trigger: reason of migration
  *
  * Context: Process context, caller hold mmap read lock, svms lock, prange lock
  *
+ * migrate all vram pages in prange to sys ram, then migrate
+ * [start, last] pages from sys ram to gpu node best_loc.
+ *
  * Return:
  * 0 - OK, otherwise error code
  */
 static int
 svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
-			struct mm_struct *mm, uint32_t trigger)
+			 unsigned long start, unsigned long last,
+			 struct mm_struct *mm, uint32_t trigger)
 {
 	int r, retries = 3;
@@ -855,7 +893,8 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
 	pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);
 
 	do {
-		r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL);
+		r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
+					    trigger, NULL);
 		if (r)
 			return r;
 	} while (prange->actual_loc && --retries);
@@ -863,17 +902,21 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
 	if (prange->actual_loc)
 		return -EDEADLK;
 
-	return svm_migrate_ram_to_vram(prange, best_loc, mm, trigger);
+	return svm_migrate_ram_to_vram(prange, best_loc, start, last, mm, trigger);
 }
 
 int
 svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
-		    struct mm_struct *mm, uint32_t trigger)
+		    unsigned long start, unsigned long last,
+		    struct mm_struct *mm, uint32_t trigger)
 {
-	if (!prange->actual_loc)
-		return svm_migrate_ram_to_vram(prange, best_loc, mm, trigger);
+	if (!prange->actual_loc || prange->actual_loc == best_loc)
+		return svm_migrate_ram_to_vram(prange, best_loc, start, last,
+					       mm, trigger);
+
 	else
-		return svm_migrate_vram_to_vram(prange, best_loc, mm, trigger);
+		return svm_migrate_vram_to_vram(prange, best_loc, start, last,
+						mm, trigger);
 }
@@ -889,10 +932,9 @@ svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
  */
 static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
 {
+	unsigned long start, last, size;
 	unsigned long addr = vmf->address;
 	struct svm_range_bo *svm_bo;
-	enum svm_work_list_ops op;
-	struct svm_range *parent;
 	struct svm_range *prange;
 	struct kfd_process *p;
 	struct mm_struct *mm;
@@ -929,51 +971,31 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
 
 	mutex_lock(&p->svms.lock);
 
-	prange = svm_range_from_addr(&p->svms, addr, &parent);
+	prange = svm_range_from_addr(&p->svms, addr, NULL);
 	if (!prange) {
 		pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, addr);
 		r = -EFAULT;
 		goto out_unlock_svms;
 	}
 
-	mutex_lock(&parent->migrate_mutex);
-	if (prange != parent)
-		mutex_lock_nested(&prange->migrate_mutex, 1);
+	mutex_lock(&prange->migrate_mutex);
 
 	if (!prange->actual_loc)
 		goto out_unlock_prange;
 
-	svm_range_lock(parent);
-	if (prange != parent)
-		mutex_lock_nested(&prange->lock, 1);
-	r = svm_range_split_by_granularity(p, mm, addr, parent, prange);
-	if (prange != parent)
-		mutex_unlock(&prange->lock);
-	svm_range_unlock(parent);
-	if (r) {
-		pr_debug("failed %d to split range by granularity\n", r);
-		goto out_unlock_prange;
-	}
+	/* Align migration range start and size to granularity size */
+	size = 1UL << prange->granularity;
+	start = max(ALIGN_DOWN(addr, size), prange->start);
+	last = min(ALIGN(addr + 1, size) - 1, prange->last);
 
-	r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm,
-				    KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
-				    vmf->page);
+	r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm, start, last,
+				    KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, vmf->page);
 	if (r)
 		pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n",
-			 r, prange->svms, prange, prange->start, prange->last);
-
-	/* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
-	if (p->xnack_enabled && parent == prange)
-		op = SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP;
-	else
-		op = SVM_OP_UPDATE_RANGE_NOTIFIER;
-	svm_range_add_list_work(&p->svms, parent, mm, op);
-	schedule_deferred_list_work(&p->svms);
+			 r, prange->svms, prange, start, last);
 
out_unlock_prange:
-	if (prange != parent)
-		mutex_unlock(&prange->migrate_mutex);
-	mutex_unlock(&parent->migrate_mutex);
+	mutex_unlock(&prange->migrate_mutex);
out_unlock_svms:
 	mutex_unlock(&p->svms.lock);
out_unref_process:
@@ -1001,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
 	void *r;
 
 	/* Page migration works on gfx9 or newer */
-	if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(9, 0, 1))
+	if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
 		return -EINVAL;
 
 	if (adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index 487f26368164..9e48d10e848e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -41,9 +41,13 @@ enum MIGRATION_COPY_DIR {
 };
 
 int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
+			unsigned long start, unsigned long last,
 			struct mm_struct *mm, uint32_t trigger);
+
 int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
-			    uint32_t trigger, struct page *fault_page);
+			    unsigned long start, unsigned long last,
+			    uint32_t trigger, struct page *fault_page);
+
 unsigned long
 svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
index 447829c22295..050a6936ff84 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
@@ -99,7 +99,8 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 		const uint32_t *cu_mask, uint32_t cu_mask_count,
 		uint32_t *se_mask, uint32_t inst)
 {
-	struct kfd_cu_info cu_info;
+	struct amdgpu_cu_info *cu_info = &mm->dev->adev->gfx.cu_info;
+	struct amdgpu_gfx_config *gfx_info = &mm->dev->adev->gfx.config;
 	uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0};
 	bool wgp_mode_req = KFD_GC_VERSION(mm->dev) >= IP_VERSION(10, 0, 0);
 	uint32_t en_mask = wgp_mode_req ? 0x3 : 0x1;
@@ -108,9 +109,7 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 	int inc = cu_inc * NUM_XCC(mm->dev->xcc_mask);
 	int xcc_inst = inst + ffs(mm->dev->xcc_mask) - 1;
 
-	amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info);
-
-	cu_active_per_node = cu_info.cu_active_number / mm->dev->kfd->num_nodes;
+	cu_active_per_node = cu_info->number / mm->dev->kfd->num_nodes;
 	if (cu_mask_count > cu_active_per_node)
 		cu_mask_count = cu_active_per_node;
@@ -118,13 +117,14 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
 	 * Returning with no CU's enabled will hang the queue, which should be
 	 * attention grabbing.
 	 */
-	if (cu_info.num_shader_engines > KFD_MAX_NUM_SE) {
-		pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", cu_info.num_shader_engines);
+	if (gfx_info->max_shader_engines > KFD_MAX_NUM_SE) {
+		pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n",
+		       gfx_info->max_shader_engines);
 		return;
 	}
-	if (cu_info.num_shader_arrays_per_engine > KFD_MAX_NUM_SH_PER_SE) {
+	if (gfx_info->max_sh_per_se > KFD_MAX_NUM_SH_PER_SE) {
 		pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n",
-			cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines);
+		       gfx_info->max_sh_per_se * gfx_info->max_shader_engines);
 		return;
 	}
@@ -142,10 +142,10 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
	 * See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info.
* See note on GFX11 cu_bitmap layout in gfx_v11_0_get_cu_info. */ - for (se = 0; se < cu_info.num_shader_engines; se++) - for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) + for (se = 0; se < gfx_info->max_shader_engines; se++) + for (sh = 0; sh < gfx_info->max_sh_per_se; sh++) cu_per_sh[se][sh] = hweight32( - cu_info.cu_bitmap[xcc_inst][se % 4][sh + (se / 4) * + cu_info->bitmap[xcc_inst][se % 4][sh + (se / 4) * cu_bitmap_sh_mul]); /* Symmetrically map cu_mask to all SEs & SHs: @@ -184,13 +184,13 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm, * * First ensure all CUs are disabled, then enable user specified CUs. */ - for (i = 0; i < cu_info.num_shader_engines; i++) + for (i = 0; i < gfx_info->max_shader_engines; i++) se_mask[i] = 0; i = inst; for (cu = 0; cu < 16; cu += cu_inc) { - for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) { - for (se = 0; se < cu_info.num_shader_engines; se++) { + for (sh = 0; sh < gfx_info->max_sh_per_se; sh++) { + for (se = 0; se < gfx_info->max_shader_engines; se++) { if (cu_per_sh[se][sh] > cu) { if (cu_mask[i / 32] & (en_mask << (i % 32))) se_mask[se] |= en_mask << (cu + sh * 16); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 1a03173e2313..8ee2bedd301a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -205,7 +205,8 @@ static int pm_set_resources_v9(struct packet_manager *pm, uint32_t *buffer, static inline bool pm_use_ext_eng(struct kfd_dev *dev) { - return dev->adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 2, 0); + return amdgpu_ip_version(dev->adev, SDMA0_HWIP, 0) >= + IP_VERSION(5, 2, 0); } static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index fa24e1852493..9cc32f577e38 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -202,7 +202,7 @@ enum cache_policy { cache_policy_noncoherent }; -#define KFD_GC_VERSION(dev) ((dev)->adev->ip_versions[GC_HWIP][0]) +#define KFD_GC_VERSION(dev) (amdgpu_ip_version((dev)->adev, GC_HWIP, 0)) #define KFD_IS_SOC15(dev) ((KFD_GC_VERSION(dev)) >= (IP_VERSION(9, 0, 1))) #define KFD_SUPPORT_XNACK_PER_PROCESS(dev)\ ((KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) || \ @@ -1343,7 +1343,7 @@ int pqm_get_queue_snapshot(struct process_queue_manager *pqm, int *num_qss_entries, uint32_t *entry_size); -int amdkfd_fence_wait_timeout(uint64_t *fence_addr, +int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm, uint64_t fence_value, unsigned int timeout_ms); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index adb5e4bdc0b2..77649392e233 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -377,7 +377,8 @@ int pqm_create_queue(struct process_queue_manager *pqm, */ uint32_t first_db_index = amdgpu_doorbell_index_on_bar(pdd->dev->adev, pdd->qpd.proc_doorbells, - 0); + 0, + pdd->dev->kfd->device_info.doorbell_size); *p_doorbell_offset_in_process = (q->properties.doorbell_off - first_db_index) * sizeof(uint32_t); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index bb16b795d1bc..f4038b33c404 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -158,12 
+158,13 @@ svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) static int svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, unsigned long offset, unsigned long npages, - unsigned long *hmm_pfns, uint32_t gpuidx) + unsigned long *hmm_pfns, uint32_t gpuidx, uint64_t *vram_pages) { enum dma_data_direction dir = DMA_BIDIRECTIONAL; dma_addr_t *addr = prange->dma_addr[gpuidx]; struct device *dev = adev->dev; struct page *page; + uint64_t vram_pages_dev; int i, r; if (!addr) { @@ -173,6 +174,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, prange->dma_addr[gpuidx] = addr; } + vram_pages_dev = 0; addr += offset; for (i = 0; i < npages; i++) { if (svm_is_valid_dma_mapping_addr(dev, addr[i])) @@ -182,6 +184,7 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, if (is_zone_device_page(page)) { struct amdgpu_device *bo_adev = prange->svm_bo->node->adev; + vram_pages_dev++; addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + bo_adev->vm_manager.vram_base_offset - bo_adev->kfd.pgmap.range.start; @@ -198,13 +201,14 @@ svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n", addr[i] >> PAGE_SHIFT, page_to_pfn(page)); } + *vram_pages = vram_pages_dev; return 0; } static int svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, unsigned long offset, unsigned long npages, - unsigned long *hmm_pfns) + unsigned long *hmm_pfns, uint64_t *vram_pages) { struct kfd_process *p; uint32_t gpuidx; @@ -223,7 +227,7 @@ svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, } r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages, - hmm_pfns, gpuidx); + hmm_pfns, gpuidx, vram_pages); if (r) break; } @@ -231,7 +235,7 @@ svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, return r; } -void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, +void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, unsigned long offset, unsigned long npages) { enum dma_data_direction dir = DMA_BIDIRECTIONAL; @@ -249,7 +253,7 @@ void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, } } -void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma) +void svm_range_dma_unmap(struct svm_range *prange) { struct kfd_process_device *pdd; dma_addr_t *dma_addr; @@ -270,10 +274,8 @@ void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma) continue; } dev = &pdd->dev->adev->pdev->dev; - if (unmap_dma) - svm_range_dma_unmap(dev, dma_addr, 0, prange->npages); - kvfree(dma_addr); - prange->dma_addr[gpuidx] = NULL; + + svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages); } } @@ -281,18 +283,29 @@ static void svm_range_free(struct svm_range *prange, bool do_unmap) { uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT; struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); + uint32_t gpuidx; pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange, prange->start, prange->last); svm_range_vram_node_free(prange); - svm_range_free_dma_mappings(prange, do_unmap); + if (do_unmap) + svm_range_dma_unmap(prange); if (do_unmap && !p->xnack_enabled) { pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size); amdgpu_amdkfd_unreserve_mem_limit(NULL, size, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0); } + + /* free dma_addr array for each gpu */ + for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { + if 
@@ -340,6 +353,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 	INIT_LIST_HEAD(&prange->child_list);
 	atomic_set(&prange->invalid, 0);
 	prange->validate_timestamp = 0;
+	prange->vram_pages = 0;
 	mutex_init(&prange->migrate_mutex);
 	mutex_init(&prange->lock);
 
@@ -386,6 +400,8 @@ static void svm_range_bo_release(struct kref *kref)
 			 prange->start, prange->last);
 		mutex_lock(&prange->lock);
 		prange->svm_bo = NULL;
+		/* prange should not hold vram page now */
+		WARN_ON(prange->actual_loc);
 		mutex_unlock(&prange->lock);
 
 		spin_lock(&svm_bo->list_lock);
@@ -495,11 +511,11 @@ svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
 
 		/* We need a new svm_bo. Spin-loop to wait for concurrent
 		 * svm_range_bo_release to finish removing this range from
-		 * its range list. After this, it is safe to reuse the
-		 * svm_bo pointer and svm_bo_list head.
+		 * its range list and set prange->svm_bo to null. After this,
+		 * it is safe to reuse the svm_bo pointer and svm_bo_list head.
 		 */
-		while (!list_empty_careful(&prange->svm_bo_list))
-			;
+		while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo)
+			cond_resched();
 
 		return false;
 	}
@@ -628,8 +644,15 @@ create_bo_failed:
 
 void svm_range_vram_node_free(struct svm_range *prange)
 {
-	svm_range_bo_unref(prange->svm_bo);
-	prange->ttm_res = NULL;
+	/* serialize prange->svm_bo unref */
+	mutex_lock(&prange->lock);
+	/* prange->svm_bo has not been unref */
+	if (prange->ttm_res) {
+		prange->ttm_res = NULL;
+		mutex_unlock(&prange->lock);
+		svm_range_bo_unref(prange->svm_bo);
+	} else
+		mutex_unlock(&prange->lock);
 }
 
 struct kfd_node *
@@ -820,7 +843,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
 		}
 	}
 
-	return !prange->is_error_flag;
+	return true;
 }
 
 /**
@@ -959,6 +982,11 @@ svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
 	new->svm_bo = svm_range_bo_ref(old->svm_bo);
 	new->ttm_res = old->ttm_res;
 
+	/* set new's vram_pages as old range's now, the accurate vram_pages
+	 * will be updated during mapping
+	 */
+	new->vram_pages = min(old->vram_pages, new->npages);
+
 	spin_lock(&new->svm_bo->list_lock);
 	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
 	spin_unlock(&new->svm_bo->list_lock);
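svm_range_vram_node_free() above adopts a common shape: clear the owning pointer while holding the lock, but drop the reference only after unlocking, so a potentially heavyweight release path never runs under the range lock. A minimal pthread sketch of the same pattern (illustrative names, not kernel API; the refcount handling is deliberately simplistic):

    #include <pthread.h>
    #include <stdlib.h>

    struct object {
        int refcount;
    };

    static void object_unref(struct object *obj)
    {
        if (--obj->refcount == 0)   /* demo decrement, not atomic */
            free(obj);
    }

    struct range {
        pthread_mutex_t lock;
        struct object *res;     /* non-NULL while we still hold a ref */
        struct object *obj;
    };

    static void range_release_resource(struct range *r)
    {
        struct object *to_unref = NULL;

        pthread_mutex_lock(&r->lock);
        if (r->res) {           /* nobody else released it yet */
            r->res = NULL;
            to_unref = r->obj;
        }
        pthread_mutex_unlock(&r->lock);

        if (to_unref)           /* heavy work happens outside the lock */
            object_unref(to_unref);
    }

The test-under-lock also makes the function safe to call twice, which the validate-and-map path below relies on when it drops an unused svm_bo.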
@@ -1189,14 +1217,15 @@ svm_range_get_pte_flags(struct kfd_node *node,
 	uint32_t mapping_flags = 0;
 	uint64_t pte_flags;
 	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
-	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
+	bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
+	bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
 	bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
 	unsigned int mtype_local;
 
 	if (domain == SVM_RANGE_VRAM_DOMAIN)
 		bo_node = prange->svm_bo->node;
 
-	switch (node->adev->ip_versions[GC_HWIP][0]) {
+	switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) {
 	case IP_VERSION(9, 4, 1):
 		if (domain == SVM_RANGE_VRAM_DOMAIN) {
 			if (bo_node == node) {
@@ -1233,7 +1262,8 @@ svm_range_get_pte_flags(struct kfd_node *node,
 		break;
 	case IP_VERSION(9, 4, 3):
 		mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
-			      (amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW);
+			      (amdgpu_mtype_local == 2 || ext_coherent ?
+			       AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW);
 		snoop = true;
 		if (uncached) {
 			mapping_flags |= AMDGPU_VM_MTYPE_UC;
@@ -1242,10 +1272,12 @@
 			if (bo_node->adev == node->adev &&
 			    (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
 				mapping_flags |= mtype_local;
-			/* local HBM region far from partition or remote XGMI GPU */
-			else if (svm_nodes_in_same_hive(bo_node, node))
+			/* local HBM region far from partition or remote XGMI GPU
+			 * with regular system scope coherence
+			 */
+			else if (svm_nodes_in_same_hive(bo_node, node) && !ext_coherent)
 				mapping_flags |= AMDGPU_VM_MTYPE_NC;
-			/* PCIe P2P */
+			/* PCIe P2P or extended system scope coherence */
 			else
 				mapping_flags |= AMDGPU_VM_MTYPE_UC;
 		/* system memory accessed by the APU */
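For GFX 9.4.3, the new KFD_IOCTL_SVM_FLAG_EXT_COHERENT flag changes the MTYPE choice above: local memory is mapped CC instead of RW, and XGMI peers that would otherwise get NC fall through to UC. A distilled decision function; the enum values and the assumption of the amdgpu_mtype_local == 0 default and cached mappings are inventions of this sketch, not the kernel's AMDGPU_VM_MTYPE_* definitions:

    #include <stdbool.h>

    enum mtype { MTYPE_RW, MTYPE_NC, MTYPE_CC, MTYPE_UC };

    /* Distilled from the GFX 9.4.3 branch in the hunk above. */
    static enum mtype pick_mtype(bool local, bool same_hive,
                                 bool ext_coherent)
    {
        if (local)                       /* same node and memory partition */
            return ext_coherent ? MTYPE_CC : MTYPE_RW;
        if (same_hive && !ext_coherent)  /* XGMI peer, regular coherence */
            return MTYPE_NC;
        return MTYPE_UC;                 /* PCIe P2P or ext-coherent remote */
    }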
@@ -1592,6 +1624,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 	struct svm_validate_context *ctx;
 	unsigned long start, end, addr;
 	struct kfd_process *p;
+	uint64_t vram_pages;
 	void *owner;
 	int32_t idx;
 	int r = 0;
@@ -1660,75 +1693,85 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 		}
 	}
 
+	vram_pages = 0;
 	start = prange->start << PAGE_SHIFT;
 	end = (prange->last + 1) << PAGE_SHIFT;
-	for (addr = start; addr < end && !r; ) {
+	for (addr = start; !r && addr < end; ) {
 		struct hmm_range *hmm_range;
 		struct vm_area_struct *vma;
-		unsigned long next;
+		uint64_t vram_pages_vma;
+		unsigned long next = 0;
 		unsigned long offset;
 		unsigned long npages;
 		bool readonly;
 
 		vma = vma_lookup(mm, addr);
-		if (!vma) {
+		if (vma) {
+			readonly = !(vma->vm_flags & VM_WRITE);
+
+			next = min(vma->vm_end, end);
+			npages = (next - addr) >> PAGE_SHIFT;
+			WRITE_ONCE(p->svms.faulting_task, current);
+			r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
+						       readonly, owner, NULL,
+						       &hmm_range);
+			WRITE_ONCE(p->svms.faulting_task, NULL);
+			if (r) {
+				pr_debug("failed %d to get svm range pages\n", r);
+				if (r == -EBUSY)
+					r = -EAGAIN;
+			}
+		} else {
 			r = -EFAULT;
-			goto unreserve_out;
-		}
-		readonly = !(vma->vm_flags & VM_WRITE);
-
-		next = min(vma->vm_end, end);
-		npages = (next - addr) >> PAGE_SHIFT;
-		WRITE_ONCE(p->svms.faulting_task, current);
-		r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
-					       readonly, owner, NULL,
-					       &hmm_range);
-		WRITE_ONCE(p->svms.faulting_task, NULL);
-		if (r) {
-			pr_debug("failed %d to get svm range pages\n", r);
-			if (r == -EBUSY)
-				r = -EAGAIN;
-			goto unreserve_out;
 		}
 
-		offset = (addr - start) >> PAGE_SHIFT;
-		r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
-				      hmm_range->hmm_pfns);
-		if (r) {
-			pr_debug("failed %d to dma map range\n", r);
-			goto unreserve_out;
+		if (!r) {
+			offset = (addr - start) >> PAGE_SHIFT;
+			r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
+					      hmm_range->hmm_pfns, &vram_pages_vma);
+			if (r)
+				pr_debug("failed %d to dma map range\n", r);
+			else
+				vram_pages += vram_pages_vma;
 		}
 
 		svm_range_lock(prange);
-		if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
+		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
 			pr_debug("hmm update the range, need validate again\n");
 			r = -EAGAIN;
-			goto unlock_out;
 		}
-		if (!list_empty(&prange->child_list)) {
+
+		if (!r && !list_empty(&prange->child_list)) {
 			pr_debug("range split by unmap in parallel, validate again\n");
 			r = -EAGAIN;
-			goto unlock_out;
 		}
 
-		r = svm_range_map_to_gpus(prange, offset, npages, readonly,
-					  ctx->bitmap, wait, flush_tlb);
+		if (!r)
+			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
						  ctx->bitmap, wait, flush_tlb);
+
+		if (!r && next == end)
+			prange->mapped_to_gpu = true;
 
-unlock_out:
 		svm_range_unlock(prange);
 
 		addr = next;
 	}
 
 	if (addr == end) {
-		prange->validated_once = true;
-		prange->mapped_to_gpu = true;
+		prange->vram_pages = vram_pages;
+
+		/* if prange does not include any vram page and it
+		 * has not released svm_bo, drop its svm_bo reference
+		 * and set its actual_loc to sys ram
+		 */
+		if (!vram_pages && prange->ttm_res) {
+			prange->actual_loc = 0;
+			svm_range_vram_node_free(prange);
+		}
 	}
 
-unreserve_out:
 	svm_range_unreserve_bos(ctx);
-
-	prange->is_error_flag = !!r;
 	if (!r)
 		prange->validate_timestamp = ktime_get_boottime();
 
@@ -1980,6 +2023,7 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
 	new->actual_loc = old->actual_loc;
 	new->granularity = old->granularity;
 	new->mapped_to_gpu = old->mapped_to_gpu;
+	new->vram_pages = old->vram_pages;
 	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
 	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
 
@@ -2097,7 +2141,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
 		next = interval_tree_iter_next(node, start, last);
 		next_start = min(node->last, last) + 1;
 
-		if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
+		if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
+		    prange->mapped_to_gpu) {
 			/* nothing to do */
 		} else if (node->start < start || node->last > last) {
 			/* node intersects the update range and its attributes
@@ -2884,6 +2929,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 			uint32_t vmid, uint32_t node_id,
 			uint64_t addr, bool write_fault)
 {
+	unsigned long start, last, size;
 	struct mm_struct *mm = NULL;
 	struct svm_range_list *svms;
 	struct svm_range *prange;
@@ -3019,32 +3065,35 @@ retry_write_locked:
 	kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
 				       write_fault, timestamp);
 
-	if (prange->actual_loc != best_loc) {
+	if (prange->actual_loc != 0 || best_loc != 0) {
 		migration = true;
+		/* Align migration range start and size to granularity size */
+		size = 1UL << prange->granularity;
+		start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start);
+		last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last);
+
 		if (best_loc) {
-			r = svm_migrate_to_vram(prange, best_loc, mm,
-					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
+			r = svm_migrate_to_vram(prange, best_loc, start, last,
+					mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
 			if (r) {
 				pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
 					 r, addr);
 				/* Fallback to system memory if migration to
 				 * VRAM failed
 				 */
-				if (prange->actual_loc)
-					r = svm_migrate_vram_to_ram(prange, mm,
-					   KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
-					   NULL);
+				if (prange->actual_loc && prange->actual_loc != best_loc)
+					r = svm_migrate_vram_to_ram(prange, mm, start, last,
						KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
 				else
 					r = 0;
 			}
 		} else {
-			r = svm_migrate_vram_to_ram(prange, mm,
-					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
-					NULL);
+			r = svm_migrate_vram_to_ram(prange, mm, start, last,
					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
 		}
 		if (r) {
 			pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
-				 r, svms, prange->start, prange->last);
+				 r, svms, start, last);
 			goto out_unlock_range;
 		}
 	}
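The fault path above no longer migrates the whole range; it clamps an aligned window around the faulting address to the range's migration granularity. The arithmetic, worked with concrete page numbers and ALIGN/ALIGN_DOWN open-coded as bit masks:

    #include <stdio.h>

    int main(void)
    {
        unsigned long addr = 0x12345;           /* faulting page number */
        unsigned long range_start = 0x12000, range_last = 0x12fff;
        unsigned int granularity = 9;           /* log2 pages: 512-page chunks */

        unsigned long size = 1UL << granularity;
        unsigned long start = addr & ~(size - 1);               /* ALIGN_DOWN */
        unsigned long last = ((addr + 1 + size - 1) & ~(size - 1)) - 1; /* ALIGN - 1 */

        /* clamp to the range bounds, as max_t/min_t do above */
        if (start < range_start)
            start = range_start;
        if (last > range_last)
            last = range_last;

        /* 0x12345 with 512-page granularity -> [0x12200 0x123ff] */
        printf("migrate [0x%lx 0x%lx]\n", start, last);
        return 0;
    }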
@@ -3398,18 +3447,24 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
 	*migrated = false;
 	best_loc = svm_range_best_prefetch_location(prange);
 
-	if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
-	    best_loc == prange->actual_loc)
+	/* when best_loc is a gpu node and same as prange->actual_loc
+	 * we still need to do migration as prange->actual_loc != 0 does
+	 * not mean all pages in prange are vram. hmm migrate will pick
+	 * up the right pages during migration.
+	 */
+	if ((best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) ||
+	    (best_loc == 0 && prange->actual_loc == 0))
 		return 0;
 
 	if (!best_loc) {
-		r = svm_migrate_vram_to_ram(prange, mm,
+		r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
 					KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
 		*migrated = !r;
 		return r;
 	}
 
-	r = svm_migrate_to_vram(prange, best_loc, mm, KFD_MIGRATE_TRIGGER_PREFETCH);
+	r = svm_migrate_to_vram(prange, best_loc, prange->start, prange->last,
				mm, KFD_MIGRATE_TRIGGER_PREFETCH);
 	*migrated = !r;
 
 	return r;
@@ -3464,7 +3519,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work)
 
 		mutex_lock(&prange->migrate_mutex);
 		do {
+			/* migrate all vram pages in this prange to sys ram
+			 * after that prange->actual_loc should be zero
+			 */
 			r = svm_migrate_vram_to_ram(prange, mm,
+					prange->start, prange->last,
 					KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
 		} while (!r && prange->actual_loc && --retries);
 
@@ -3507,7 +3566,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 	struct svm_range *next;
 	bool update_mapping = false;
 	bool flush_tlb;
-	int r = 0;
+	int r, ret = 0;
 
 	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
 		 p->pasid, &p->svms, start, start + size - 1, size);
@@ -3595,7 +3654,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 out_unlock_range:
 		mutex_unlock(&prange->migrate_mutex);
 		if (r)
-			break;
+			ret = r;
 	}
 
 	dynamic_svm_range_dump(svms);
@@ -3608,7 +3667,7 @@ out:
 	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
 		 &p->svms, start, start + size - 1, r);
 
-	return r;
+	return ret ? ret : r;
 }
 
 static int
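svm_range_set_attr() above stops bailing out at the first failed range: it records the error, keeps processing the remaining ranges, and reports the recorded error at the end. The control-flow shape, reduced to a sketch with an arbitrary per-item operation (illustrative, not the kernel function):

    /* Apply 'op' to every item; remember that something failed
     * instead of stopping early, mirroring the r/ret split above.
     * A later failure overwrites an earlier one, as in the patch. */
    static int apply_to_all(const int *items, int n, int (*op)(int))
    {
        int r, ret = 0;

        for (int i = 0; i < n; i++) {
            r = op(items[i]);
            if (r)
                ret = r;    /* record failure, keep going */
        }
        return ret;
    }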
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 9e668eeefb32..be11ba0c4289 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -78,6 +78,7 @@ struct svm_work_list_item {
  * @update_list:link list node used to add to update_list
  * @mapping: bo_va mapping structure to create and update GPU page table
  * @npages: number of pages
+ * @vram_pages: vram pages number in this svm_range
  * @dma_addr: dma mapping address on each GPU for system memory physical page
  * @ttm_res: vram ttm resource map
  * @offset: range start offset within mm_nodes
@@ -88,7 +89,9 @@ struct svm_work_list_item {
  * @flags: flags defined as KFD_IOCTL_SVM_FLAG_*
  * @perferred_loc: perferred location, 0 for CPU, or GPU id
  * @perfetch_loc: last prefetch location, 0 for CPU, or GPU id
- * @actual_loc: the actual location, 0 for CPU, or GPU id
+ * @actual_loc: this svm_range location. 0: all pages are from sys ram;
+ *		GPU id: this svm_range may include vram pages from GPU with
+ *		id actual_loc.
  * @granularity:migration granularity, log2 num pages
  * @invalid: not 0 means cpu page table is invalidated
  * @validate_timestamp: system timestamp when range is validated
@@ -112,6 +115,7 @@ struct svm_range {
 	struct list_head	list;
 	struct list_head	update_list;
 	uint64_t		npages;
+	uint64_t		vram_pages;
 	dma_addr_t		*dma_addr[MAX_GPU_INSTANCE];
 	struct ttm_resource	*ttm_res;
 	uint64_t		offset;
@@ -132,9 +136,7 @@ struct svm_range {
 	struct list_head	child_list;
 	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
 	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
-	bool			validated_once;
 	bool			mapped_to_gpu;
-	bool			is_error_flag;
 };
 
 static inline void svm_range_lock(struct svm_range *prange)
@@ -181,9 +183,9 @@ void svm_range_add_list_work(struct svm_range_list *svms,
 			     struct svm_range *prange, struct mm_struct *mm,
 			     enum svm_work_list_ops op);
 void schedule_deferred_list_work(struct svm_range_list *svms);
-void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
+void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
 			 unsigned long offset, unsigned long npages);
-void svm_range_free_dma_mappings(struct svm_range *prange, bool unmap_dma);
+void svm_range_dma_unmap(struct svm_range *prange);
 int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
 		       uint64_t *svm_priv_data_size);
 int kfd_criu_checkpoint_svm(struct kfd_process *p,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index c8c75ff7cea8..4e530791507e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1533,7 +1533,6 @@ out:
 /* Helper function. See kfd_fill_gpu_cache_info for parameter description */
 static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
 				struct kfd_gpu_cache_info *pcache_info,
-				struct kfd_cu_info *cu_info,
 				int cu_bitmask,
 				int cache_type, unsigned int cu_processor_id,
 				int cu_block)
@@ -1595,7 +1594,8 @@ static int fill_in_l1_pcache(struct kfd_cache_properties **props_ext,
 /* Helper function. See kfd_fill_gpu_cache_info for parameter description */
 static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
 				struct kfd_gpu_cache_info *pcache_info,
-				struct kfd_cu_info *cu_info,
+				struct amdgpu_cu_info *cu_info,
+				struct amdgpu_gfx_config *gfx_info,
 				int cache_type, unsigned int cu_processor_id,
 				struct kfd_node *knode)
 {
@@ -1606,7 +1606,7 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
 
 	start = ffs(knode->xcc_mask) - 1;
 	end = start + NUM_XCC(knode->xcc_mask);
-	cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0];
+	cu_sibling_map_mask = cu_info->bitmap[start][0][0];
 	cu_sibling_map_mask &=
 		((1 << pcache_info[cache_type].num_cu_shared) - 1);
 	first_active_cu = ffs(cu_sibling_map_mask);
@@ -1642,15 +1642,15 @@ static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
 		k = 0;
 
 		for (xcc = start; xcc < end; xcc++) {
-			for (i = 0; i < cu_info->num_shader_engines; i++) {
-				for (j = 0; j < cu_info->num_shader_arrays_per_engine; j++) {
+			for (i = 0; i < gfx_info->max_shader_engines; i++) {
+				for (j = 0; j < gfx_info->max_sh_per_se; j++) {
 					pcache->sibling_map[k] = (uint8_t)(cu_sibling_map_mask & 0xFF);
 					pcache->sibling_map[k+1] = (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
 					pcache->sibling_map[k+2] = (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
 					pcache->sibling_map[k+3] = (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
 					k += 4;
 
-					cu_sibling_map_mask = cu_info->cu_bitmap[xcc][i % 4][j + i / 4];
+					cu_sibling_map_mask = cu_info->bitmap[xcc][i % 4][j + i / 4];
 					cu_sibling_map_mask &= ((1 << pcache_info[cache_type].num_cu_shared) - 1);
 				}
 			}
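The sibling-map fill above serializes each 32-bit CU sibling mask into the byte-oriented sibling_map, four bytes per SE/SH step. The byte split itself is plain shifting and masking; a self-contained version of that one step (names invented for the sketch):

    #include <stdint.h>

    /* Split a 32-bit CU mask into four little-endian bytes, as the
     * sibling_map fill does for each shader-array iteration. */
    static void pack_sibling_mask(uint8_t out[4], uint32_t mask)
    {
        out[0] = (uint8_t)(mask & 0xFF);
        out[1] = (uint8_t)((mask >> 8) & 0xFF);
        out[2] = (uint8_t)((mask >> 16) & 0xFF);
        out[3] = (uint8_t)((mask >> 24) & 0xFF);
    }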
@@ -1675,16 +1675,14 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev)
 	unsigned int cu_processor_id;
 	int ret;
 	unsigned int num_cu_shared;
-	struct kfd_cu_info cu_info;
-	struct kfd_cu_info *pcu_info;
+	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
+	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
 	int gpu_processor_id;
 	struct kfd_cache_properties *props_ext;
 	int num_of_entries = 0;
 	int num_of_cache_types = 0;
 	struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
 
-	amdgpu_amdkfd_get_cu_info(kdev->adev, &cu_info);
-	pcu_info = &cu_info;
 
 	gpu_processor_id = dev->node_props.simd_id_base;
 
@@ -1711,12 +1709,12 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev)
 		cu_processor_id = gpu_processor_id;
 		if (pcache_info[ct].cache_level == 1) {
 			for (xcc = start; xcc < end; xcc++) {
-				for (i = 0; i < pcu_info->num_shader_engines; i++) {
-					for (j = 0; j < pcu_info->num_shader_arrays_per_engine; j++) {
-						for (k = 0; k < pcu_info->num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
+				for (i = 0; i < gfx_info->max_shader_engines; i++) {
+					for (j = 0; j < gfx_info->max_sh_per_se; j++) {
+						for (k = 0; k < gfx_info->max_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
 
-							ret = fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
-										pcu_info->cu_bitmap[xcc][i % 4][j + i / 4], ct,
+							ret = fill_in_l1_pcache(&props_ext, pcache_info,
+										cu_info->bitmap[xcc][i % 4][j + i / 4], ct,
 										cu_processor_id, k);
 
 							if (ret < 0)
@@ -1729,9 +1727,9 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev)
 
 							/* Move to next CU block */
 							num_cu_shared = ((k + pcache_info[ct].num_cu_shared) <=
-								pcu_info->num_cu_per_sh) ?
+								gfx_info->max_cu_per_sh) ?
 								pcache_info[ct].num_cu_shared :
-								(pcu_info->num_cu_per_sh - k);
+								(gfx_info->max_cu_per_sh - k);
 							cu_processor_id += num_cu_shared;
 						}
 					}
 				}
 			}
@@ -1739,7 +1737,7 @@ static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev, struct kfd_node *kdev)
 		} else {
 			ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
-					pcu_info, ct, cu_processor_id, kdev);
+					cu_info, gfx_info, ct, cu_processor_id, kdev);
 
 			if (ret < 0)
 				break;
@@ -1918,10 +1916,11 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 {
 	uint32_t gpu_id;
 	struct kfd_topology_device *dev;
-	struct kfd_cu_info cu_info;
 	int res = 0;
 	int i;
 	const char *asic_name = amdgpu_asic_name[gpu->adev->asic_type];
+	struct amdgpu_gfx_config *gfx_info = &gpu->adev->gfx.config;
+	struct amdgpu_cu_info *cu_info = &gpu->adev->gfx.cu_info;
 
 	gpu_id = kfd_generate_gpu_id(gpu);
 	if (gpu->xcp && !gpu->xcp->ddev) {
@@ -1959,9 +1958,6 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 	/* Fill-in additional information that is not available in CRAT but
 	 * needed for the topology
 	 */
-
-	amdgpu_amdkfd_get_cu_info(dev->gpu->adev, &cu_info);
-
 	for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1; i++) {
 		dev->node_props.name[i] = __tolower(asic_name[i]);
 		if (asic_name[i] == '\0')
@@ -1970,7 +1966,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 	dev->node_props.name[i] = '\0';
 
 	dev->node_props.simd_arrays_per_engine =
-		cu_info.num_shader_arrays_per_engine;
+		gfx_info->max_sh_per_se;
 
 	dev->node_props.gfx_target_version =
 		gpu->kfd->device_info.gfx_target_version;
@@ -2051,7 +2047,7 @@ int kfd_topology_add_device(struct kfd_node *gpu)
 	 */
 	if (dev->gpu->adev->asic_type == CHIP_CARRIZO) {
 		dev->node_props.simd_count =
-			cu_info.simd_per_cu * cu_info.cu_active_number;
+			cu_info->simd_per_cu * cu_info->number;
 		dev->node_props.max_waves_per_simd = 10;
 	}