diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 598 |
1 files changed, 529 insertions, 69 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 0b655555e167..d4593374e7a1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -35,7 +35,8 @@ #include "cik_regs.h" #include "kfd_kernel_queue.h" #include "amdgpu_amdkfd.h" -#include "mes_api_def.h" +#include "amdgpu_reset.h" +#include "mes_v11_api_def.h" #include "kfd_debug.h" /* Size of the per-pipe EOP queue */ @@ -152,17 +153,24 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, static void kfd_hws_hang(struct device_queue_manager *dqm) { + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + /* Mark all device queues as reset. */ + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + pdd->has_reset_queue = true; + } + } + /* * Issue a GPU reset if HWS is unresponsive */ - dqm->is_hws_hang = true; - - /* It's possible we're detecting a HWS hang in the - * middle of a GPU reset. No need to schedule another - * reset in this case. - */ - if (!dqm->is_resetting) - schedule_work(&dqm->hw_exception_work); + schedule_work(&dqm->hw_exception_work); } static int convert_to_mes_queue_type(int queue_type) @@ -194,9 +202,26 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r, queue_type; uint64_t wptr_addr_off; - if (dqm->is_hws_hang) + if (!dqm->sched_running || dqm->sched_halt) + return 0; + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; + if (!pdd->proc_ctx_cpu_ptr) { + r = amdgpu_amdkfd_alloc_gtt_mem(adev, + AMDGPU_MES_PROC_CTX_SIZE, + &pdd->proc_ctx_bo, + &pdd->proc_ctx_gpu_addr, + &pdd->proc_ctx_cpu_ptr, + false); + if (r) { + dev_err(adev->dev, + "failed to allocate process context bo\n"); + return r; + } + memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); + } + memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); queue_input.process_id = qpd->pqm->process->pasid; queue_input.page_table_base_addr = qpd->page_table_base; @@ -214,10 +239,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, queue_input.mqd_addr = q->gart_mqd_addr; queue_input.wptr_addr = (uint64_t)q->properties.write_ptr; - if (q->wptr_bo) { - wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1); - queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->wptr_bo) + wptr_addr_off; - } + wptr_addr_off = (uint64_t)q->properties.write_ptr & (PAGE_SIZE - 1); + queue_input.wptr_mc_addr = amdgpu_bo_gpu_offset(q->properties.wptr_bo) + wptr_addr_off; queue_input.is_kfd_process = 1; queue_input.is_aql_queue = (q->properties.format == KFD_QUEUE_FORMAT_AQL); @@ -236,6 +259,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, if (queue_type < 0) { dev_err(adev->dev, "Queue type not supported with MES, queue:%d\n", q->properties.type); + up_read(&adev->reset_domain->sem); return -EINVAL; } queue_input.queue_type = (uint32_t)queue_type; @@ -245,6 +269,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n", q->properties.doorbell_off); @@ -262,7 +287,9 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, int r; struct mes_remove_queue_input queue_input; - if (dqm->is_hws_hang) + if (!dqm->sched_running || dqm->sched_halt) + return 0; + if (!down_read_trylock(&adev->reset_domain->sem)) return -EIO; memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); @@ -272,6 +299,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, amdgpu_mes_lock(&adev->mes); r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input); amdgpu_mes_unlock(&adev->mes); + up_read(&adev->reset_domain->sem); if (r) { dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n", @@ -283,7 +311,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q, return r; } -static int remove_all_queues_mes(struct device_queue_manager *dqm) +static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) { struct device_process_node *cur; struct device *dev = dqm->dev->adev->dev; @@ -310,6 +338,73 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) return retval; } +static int add_all_kfd_queues_mes(struct device_queue_manager *dqm) +{ + struct device_process_node *cur; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q; + int retval = 0; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (!q->properties.is_active) + continue; + retval = add_queue_mes(dqm, q, qpd); + if (retval) { + dev_err(dev, "%s: Failed to add queue %d for dev %d", + __func__, + q->properties.queue_id, + dqm->dev->id); + return retval; + } + } + } + + return retval; +} + +static int suspend_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r = 0; + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + + r = amdgpu_mes_suspend(adev); + up_read(&adev->reset_domain->sem); + + if (r) { + dev_err(adev->dev, "failed to suspend gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + +static int resume_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r = 0; + + if (!down_read_trylock(&adev->reset_domain->sem)) + return -EIO; + + r = amdgpu_mes_resume(adev); + up_read(&adev->reset_domain->sem); + + if (r) { + dev_err(adev->dev, "failed to resume gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + static void increment_queue_count(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -883,6 +978,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q, else if (prev_active) retval = remove_queue_mes(dqm, q, &pdd->qpd); + /* queue is reset so inaccessable */ + if (pdd->has_reset_queue) { + retval = -EACCES; + goto out_unlock; + } + if (retval) { dev_err(dev, "unmap queue failed\n"); goto out_unlock; @@ -1468,20 +1569,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm) } if (dqm->dev->adev->asic_type == CHIP_HAWAII) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); dqm->sched_running = false; dqm_unlock(dqm); return 0; } -static void pre_reset(struct device_queue_manager *dqm) -{ - dqm_lock(dqm); - dqm->is_resetting = true; - dqm_unlock(dqm); -} - static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q, const uint32_t *restore_sdma_id) { @@ -1544,6 +1638,41 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm, q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev); q->properties.sdma_queue_id = q->sdma_id / kfd_get_num_xgmi_sdma_engines(dqm->dev); + } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { + int i, num_queues, num_engines, eng_offset = 0, start_engine; + bool free_bit_found = false, is_xgmi = false; + + if (q->properties.sdma_engine_id < kfd_get_num_sdma_engines(dqm->dev)) { + num_queues = get_num_sdma_queues(dqm); + num_engines = kfd_get_num_sdma_engines(dqm->dev); + q->properties.type = KFD_QUEUE_TYPE_SDMA; + } else { + num_queues = get_num_xgmi_sdma_queues(dqm); + num_engines = kfd_get_num_xgmi_sdma_engines(dqm->dev); + eng_offset = kfd_get_num_sdma_engines(dqm->dev); + q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI; + is_xgmi = true; + } + + /* Scan available bit based on target engine ID. */ + start_engine = q->properties.sdma_engine_id - eng_offset; + for (i = start_engine; i < num_queues; i += num_engines) { + + if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap)) + continue; + + clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap); + q->sdma_id = i; + q->properties.sdma_queue_id = q->sdma_id / num_engines; + free_bit_found = true; + break; + } + + if (!free_bit_found) { + dev_err(dev, "No more SDMA queue to allocate for target ID %i\n", + q->properties.sdma_engine_id); + return -ENOMEM; + } } pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id); @@ -1636,10 +1765,67 @@ static int initialize_cpsch(struct device_queue_manager *dqm) return 0; } +/* halt_cpsch: + * Unmap queues so the schedule doesn't continue remaining jobs in the queue. + * Then set dqm->sched_halt so queues don't map to runlist until unhalt_cpsch + * is called. + */ +static int halt_cpsch(struct device_queue_manager *dqm) +{ + int ret = 0; + + dqm_lock(dqm); + if (!dqm->sched_running) { + dqm_unlock(dqm); + return 0; + } + + WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n"); + + if (!dqm->is_hws_hang) { + if (!dqm->dev->kfd->shared_resources.enable_mes) + ret = unmap_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, + USE_DEFAULT_GRACE_PERIOD, false); + else + ret = remove_all_kfd_queues_mes(dqm); + } + dqm->sched_halt = true; + dqm_unlock(dqm); + + return ret; +} + +/* unhalt_cpsch + * Unset dqm->sched_halt and map queues back to runlist + */ +static int unhalt_cpsch(struct device_queue_manager *dqm) +{ + int ret = 0; + + dqm_lock(dqm); + if (!dqm->sched_running || !dqm->sched_halt) { + WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n"); + dqm_unlock(dqm); + return 0; + } + dqm->sched_halt = false; + if (!dqm->dev->kfd->shared_resources.enable_mes) + ret = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + 0, USE_DEFAULT_GRACE_PERIOD); + else + ret = add_all_kfd_queues_mes(dqm); + + dqm_unlock(dqm); + + return ret; +} + static int start_cpsch(struct device_queue_manager *dqm) { struct device *dev = dqm->dev->adev->dev; - int retval; + int retval, num_hw_queue_slots; retval = 0; @@ -1669,8 +1855,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); /* clear hang status when driver try to start the hw scheduler */ - dqm->is_hws_hang = false; - dqm->is_resetting = false; dqm->sched_running = true; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1694,13 +1878,28 @@ static int start_cpsch(struct device_queue_manager *dqm) &dqm->wait_times); } + /* setup per-queue reset detection buffer */ + num_hw_queue_slots = dqm->dev->kfd->shared_resources.num_queue_per_pipe * + dqm->dev->kfd->shared_resources.num_pipe_per_mec * + NUM_XCC(dqm->dev->xcc_mask); + + dqm->detect_hang_info_size = num_hw_queue_slots * sizeof(struct dqm_detect_hang_info); + dqm->detect_hang_info = kzalloc(dqm->detect_hang_info_size, GFP_KERNEL); + + if (!dqm->detect_hang_info) { + retval = -ENOMEM; + goto fail_detect_hang_buffer; + } + dqm_unlock(dqm); return 0; +fail_detect_hang_buffer: + kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); fail_allocate_vidmem: fail_set_sched_resources: if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, false); + pm_uninit(&dqm->packet_mgr); fail_packet_manager_init: dqm_unlock(dqm); return retval; @@ -1708,22 +1907,17 @@ fail_packet_manager_init: static int stop_cpsch(struct device_queue_manager *dqm) { - bool hanging; - dqm_lock(dqm); if (!dqm->sched_running) { dqm_unlock(dqm); return 0; } - if (!dqm->is_hws_hang) { - if (!dqm->dev->kfd->shared_resources.enable_mes) - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); - else - remove_all_queues_mes(dqm); - } + if (!dqm->dev->kfd->shared_resources.enable_mes) + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); + else + remove_all_kfd_queues_mes(dqm); - hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; if (!dqm->dev->kfd->shared_resources.enable_mes) @@ -1731,7 +1925,9 @@ static int stop_cpsch(struct device_queue_manager *dqm) kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); if (!dqm->dev->kfd->shared_resources.enable_mes) - pm_uninit(&dqm->packet_mgr, hanging); + pm_uninit(&dqm->packet_mgr); + kfree(dqm->detect_hang_info); + dqm->detect_hang_info = NULL; dqm_unlock(dqm); return 0; @@ -1803,7 +1999,8 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q, } if (q->properties.type == KFD_QUEUE_TYPE_SDMA || - q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI || + q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { dqm_lock(dqm); retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id : NULL); dqm_unlock(dqm); @@ -1900,7 +2097,7 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm, { unsigned long end_jiffies = msecs_to_jiffies(timeout_ms) + jiffies; struct device *dev = dqm->dev->adev->dev; - uint64_t *fence_addr = dqm->fence_addr; + uint64_t *fence_addr = dqm->fence_addr; while (*fence_addr != fence_value) { /* Fatal err detected, this response won't come */ @@ -1930,7 +2127,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) struct device *dev = dqm->dev->adev->dev; int retval; - if (!dqm->sched_running) + if (!dqm->sched_running || dqm->sched_halt) return 0; if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0) return 0; @@ -1948,6 +2145,135 @@ static int map_queues_cpsch(struct device_queue_manager *dqm) return retval; } +static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue *q, + struct qcm_process_device *qpd) +{ + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid 0x%0x is reset\n", + q->properties.queue_id, q->process->pasid); + + pdd->has_reset_queue = true; + if (q->properties.is_active) { + q->properties.is_active = false; + decrement_queue_count(dqm, qpd, q); + } +} + +static int detect_queue_hang(struct device_queue_manager *dqm) +{ + int i; + + /* detect should be used only in dqm locked queue reset */ + if (WARN_ON(dqm->detect_hang_count > 0)) + return 0; + + memset(dqm->detect_hang_info, 0, dqm->detect_hang_info_size); + + for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) { + uint32_t mec, pipe, queue; + int xcc_id; + + mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe) + / dqm->dev->kfd->shared_resources.num_pipe_per_mec; + + if (mec || !test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap)) + continue; + + amdgpu_queue_mask_bit_to_mec_queue(dqm->dev->adev, i, &mec, &pipe, &queue); + + for_each_inst(xcc_id, dqm->dev->xcc_mask) { + uint64_t queue_addr = dqm->dev->kfd2kgd->hqd_get_pq_addr( + dqm->dev->adev, pipe, queue, xcc_id); + struct dqm_detect_hang_info hang_info; + + if (!queue_addr) + continue; + + hang_info.pipe_id = pipe; + hang_info.queue_id = queue; + hang_info.xcc_id = xcc_id; + hang_info.queue_address = queue_addr; + + dqm->detect_hang_info[dqm->detect_hang_count] = hang_info; + dqm->detect_hang_count++; + } + } + + return dqm->detect_hang_count; +} + +static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uint64_t queue_address) +{ + struct device_process_node *cur; + struct qcm_process_device *qpd; + struct queue *q; + + list_for_each_entry(cur, &dqm->queues, list) { + qpd = cur->qpd; + list_for_each_entry(q, &qpd->queues_list, list) { + if (queue_address == q->properties.queue_address) + return q; + } + } + + return NULL; +} + +/* only for compute queue */ +static int reset_queues_on_hws_hang(struct device_queue_manager *dqm) +{ + int r = 0, reset_count = 0, i; + + if (!dqm->detect_hang_info || dqm->is_hws_hang) + return -EIO; + + /* assume dqm locked. */ + if (!detect_queue_hang(dqm)) + return -ENOTRECOVERABLE; + + for (i = 0; i < dqm->detect_hang_count; i++) { + struct dqm_detect_hang_info hang_info = dqm->detect_hang_info[i]; + struct queue *q = find_queue_by_address(dqm, hang_info.queue_address); + struct kfd_process_device *pdd; + uint64_t queue_addr = 0; + + if (!q) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + pdd = kfd_get_process_device_data(dqm->dev, q->process); + if (!pdd) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + queue_addr = dqm->dev->kfd2kgd->hqd_reset(dqm->dev->adev, + hang_info.pipe_id, hang_info.queue_id, hang_info.xcc_id, + KFD_UNMAP_LATENCY_MS); + + /* either reset failed or we reset an unexpected queue. */ + if (queue_addr != q->properties.queue_address) { + r = -ENOTRECOVERABLE; + goto reset_fail; + } + + set_queue_as_reset(dqm, q, &pdd->qpd); + reset_count++; + } + + if (reset_count == dqm->detect_hang_count) + kfd_signal_reset_event(dqm->dev); + else + r = -ENOTRECOVERABLE; + +reset_fail: + dqm->detect_hang_count = 0; + + return r; +} + /* dqm->lock mutex has to be locked before calling this function */ static int unmap_queues_cpsch(struct device_queue_manager *dqm, enum kfd_unmap_queues_filter filter, @@ -1957,26 +2283,27 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, { struct device *dev = dqm->dev->adev->dev; struct mqd_manager *mqd_mgr; - int retval = 0; + int retval; if (!dqm->sched_running) return 0; - if (dqm->is_hws_hang || dqm->is_resetting) - return -EIO; if (!dqm->active_runlist) - return retval; + return 0; + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) + return -EIO; if (grace_period != USE_DEFAULT_GRACE_PERIOD) { retval = pm_update_grace_period(&dqm->packet_mgr, grace_period); if (retval) - return retval; + goto out; } retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset); if (retval) - return retval; + goto out; *dqm->fence_addr = KFD_FENCE_INIT; + mb(); pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr, KFD_FENCE_COMPLETED); /* should be timed out */ @@ -1985,7 +2312,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, if (retval) { dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); kfd_hws_hang(dqm); - return retval; + goto out; } /* In the current MEC firmware implementation, if compute queue @@ -1997,12 +2324,15 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, * check those fields */ mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; - if (mqd_mgr->read_doorbell_id(dqm->packet_mgr.priv_queue->queue->mqd)) { - dev_err(dev, "HIQ MQD's queue_doorbell_id0 is not 0, Queue preemption time out\n"); + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { while (halt_if_hws_hang) schedule(); - kfd_hws_hang(dqm); - return -ETIME; + if (reset_queues_on_hws_hang(dqm)) { + dqm->is_hws_hang = true; + kfd_hws_hang(dqm); + retval = -ETIME; + goto out; + } } /* We need to reset the grace period value for this device */ @@ -2015,12 +2345,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, pm_release_ib(&dqm->packet_mgr); dqm->active_runlist = false; +out: + up_read(&dqm->dev->adev->reset_domain->sem); return retval; } /* only for compute queue */ -static int reset_queues_cpsch(struct device_queue_manager *dqm, - uint16_t pasid) +static int reset_queues_cpsch(struct device_queue_manager *dqm, uint16_t pasid) { int retval; @@ -2041,13 +2372,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, { int retval; - if (dqm->is_hws_hang) + if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem)) return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false); - if (retval) - return retval; - - return map_queues_cpsch(dqm); + if (!retval) + retval = map_queues_cpsch(dqm); + up_read(&dqm->dev->adev->reset_domain->sem); + return retval; } static int wait_on_destroy_queue(struct device_queue_manager *dqm, @@ -2057,6 +2388,9 @@ static int wait_on_destroy_queue(struct device_queue_manager *dqm, q->process); int ret = 0; + if (WARN_ON(!pdd)) + return ret; + if (pdd->qpd.is_debug) return ret; @@ -2126,10 +2460,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, pdd->sdma_past_activity_counter += sdma_val; } - list_del(&q->list); - qpd->queue_count--; if (q->properties.is_active) { decrement_queue_count(dqm, qpd, q); + q->properties.is_active = false; if (!dqm->dev->kfd->shared_resources.enable_mes) { retval = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, @@ -2140,6 +2473,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, retval = remove_queue_mes(dqm, q, qpd); } } + list_del(&q->list); + qpd->queue_count--; /* * Unconditionally decrement this counter, regardless of the queue's @@ -2428,10 +2763,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm, if (!dqm->dev->kfd->shared_resources.enable_mes) retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD); - if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) { + if ((retval || qpd->reset_wavefronts) && + down_read_trylock(&dqm->dev->adev->reset_domain->sem)) { pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev); dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process); qpd->reset_wavefronts = false; + up_read(&dqm->dev->adev->reset_domain->sem); } /* Lastly, free mqd resources. @@ -2538,7 +2875,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) dqm->ops.initialize = initialize_cpsch; dqm->ops.start = start_cpsch; dqm->ops.stop = stop_cpsch; - dqm->ops.pre_reset = pre_reset; + dqm->ops.halt = halt_cpsch; + dqm->ops.unhalt = unhalt_cpsch; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; dqm->ops.register_process = register_process; @@ -2559,7 +2897,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) /* initialize dqm for no cp scheduling */ dqm->ops.start = start_nocpsch; dqm->ops.stop = stop_nocpsch; - dqm->ops.pre_reset = pre_reset; dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; @@ -2598,7 +2935,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) break; default: - if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) + if (KFD_GC_VERSION(dev) >= IP_VERSION(12, 0, 0)) + device_queue_manager_init_v12(&dqm->asic_ops); + else if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0)) device_queue_manager_init_v11(&dqm->asic_ops); else if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1)) device_queue_manager_init_v10(&dqm->asic_ops); @@ -2634,7 +2973,7 @@ static void deallocate_hiq_sdma_mqd(struct kfd_node *dev, { WARN(!mqd, "No hiq sdma mqd trunk to free"); - amdgpu_amdkfd_free_gtt_mem(dev->adev, mqd->gtt_mem); + amdgpu_amdkfd_free_gtt_mem(dev->adev, &mqd->gtt_mem); } void device_queue_manager_uninit(struct device_queue_manager *dqm) @@ -2646,6 +2985,95 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) kfree(dqm); } +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id) +{ + struct kfd_process_device *pdd; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct device_queue_manager *dqm = knode->dqm; + struct device *dev = dqm->dev->adev->dev; + struct qcm_process_device *qpd; + struct queue *q = NULL; + int ret = 0; + + if (!p) + return -EINVAL; + + dqm_lock(dqm); + + pdd = kfd_get_process_device_data(dqm->dev, p); + if (pdd) { + qpd = &pdd->qpd; + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->doorbell_id == doorbell_id && q->properties.is_active) { + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + + q->properties.is_evicted = true; + q->properties.is_active = false; + decrement_queue_count(dqm, qpd, q); + + ret = remove_queue_mes(dqm, q, qpd); + if (ret) { + dev_err(dev, "Removing bad queue failed"); + goto out; + } + + ret = resume_all_queues_mes(dqm); + if (ret) + dev_err(dev, "Resuming all queues failed"); + + break; + } + } + } + +out: + dqm_unlock(dqm); + return ret; +} + +static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, + struct qcm_process_device *qpd) +{ + struct device *dev = dqm->dev->adev->dev; + int ret = 0; + + /* Check if process is already evicted */ + dqm_lock(dqm); + if (qpd->evicted) { + /* Increment the evicted count to make sure the + * process stays evicted before its terminated. + */ + qpd->evicted++; + dqm_unlock(dqm); + goto out; + } + dqm_unlock(dqm); + + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + + ret = dqm->ops.evict_process_queues(dqm, qpd); + if (ret) { + dev_err(dev, "Evicting process queues failed"); + goto out; + } + + ret = resume_all_queues_mes(dqm); + if (ret) + dev_err(dev, "Resuming all queues failed"); + +out: + return ret; +} + int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) { struct kfd_process_device *pdd; @@ -2656,8 +3084,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) return -EINVAL; WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); pdd = kfd_get_process_device_data(dqm->dev, p); - if (pdd) - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + if (pdd) { + if (dqm->dev->kfd->shared_resources.enable_mes) + ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); + else + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + } + kfd_unref_process(p); return ret; @@ -2793,7 +3226,7 @@ struct copy_context_work_handler_workarea { struct kfd_process *p; }; -static void copy_context_work_handler (struct work_struct *work) +static void copy_context_work_handler(struct work_struct *work) { struct copy_context_work_handler_workarea *workarea; struct mqd_manager *mqd_mgr; @@ -2820,6 +3253,9 @@ static void copy_context_work_handler (struct work_struct *work) struct qcm_process_device *qpd = &pdd->qpd; list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.type != KFD_QUEUE_TYPE_COMPUTE) + continue; + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP]; /* We ignore the return value from get_wave_state @@ -3160,6 +3596,30 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) return debug_map_and_unlock(dqm); } +bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + int doorbell_off, u32 *queue_format) +{ + struct queue *q; + bool r = false; + + if (!queue_format) + return r; + + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if (q->properties.doorbell_off == doorbell_off) { + *queue_format = q->properties.format; + r = true; + goto out; + } + } + +out: + dqm_unlock(dqm); + return r; +} #if defined(CONFIG_DEBUG_FS) static void seq_reg_dump(struct seq_file *m, |