diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 447 |
1 files changed, 443 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 44d87943e40a..bc9e81293165 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -952,6 +952,92 @@ out_unlock: return retval; } +/* suspend_single_queue does not lock the dqm like the + * evict_process_queues_cpsch or evict_process_queues_nocpsch. You should + * lock the dqm before calling, and unlock after calling. + * + * The reason we don't lock the dqm is because this function may be + * called on multiple queues in a loop, so rather than locking/unlocking + * multiple times, we will just keep the dqm locked for all of the calls. + */ +static int suspend_single_queue(struct device_queue_manager *dqm, + struct kfd_process_device *pdd, + struct queue *q) +{ + bool is_new; + + if (q->properties.is_suspended) + return 0; + + pr_debug("Suspending PASID %u queue [%i]\n", + pdd->process->pasid, + q->properties.queue_id); + + is_new = q->properties.exception_status & KFD_EC_MASK(EC_QUEUE_NEW); + + if (is_new || q->properties.is_being_destroyed) { + pr_debug("Suspend: skip %s queue id %i\n", + is_new ? "new" : "destroyed", + q->properties.queue_id); + return -EBUSY; + } + + q->properties.is_suspended = true; + if (q->properties.is_active) { + if (dqm->dev->kfd->shared_resources.enable_mes) { + int r = remove_queue_mes(dqm, q, &pdd->qpd); + + if (r) + return r; + } + + decrement_queue_count(dqm, &pdd->qpd, q); + q->properties.is_active = false; + } + + return 0; +} + +/* resume_single_queue does not lock the dqm like the functions + * restore_process_queues_cpsch or restore_process_queues_nocpsch. You should + * lock the dqm before calling, and unlock after calling. + * + * The reason we don't lock the dqm is because this function may be + * called on multiple queues in a loop, so rather than locking/unlocking + * multiple times, we will just keep the dqm locked for all of the calls. + */ +static int resume_single_queue(struct device_queue_manager *dqm, + struct qcm_process_device *qpd, + struct queue *q) +{ + struct kfd_process_device *pdd; + + if (!q->properties.is_suspended) + return 0; + + pdd = qpd_to_pdd(qpd); + + pr_debug("Restoring from suspend PASID %u queue [%i]\n", + pdd->process->pasid, + q->properties.queue_id); + + q->properties.is_suspended = false; + + if (QUEUE_IS_ACTIVE(q->properties)) { + if (dqm->dev->kfd->shared_resources.enable_mes) { + int r = add_queue_mes(dqm, q, &pdd->qpd); + + if (r) + return r; + } + + q->properties.is_active = true; + increment_queue_count(dqm, qpd, q); + } + + return 0; +} + static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd) { @@ -1926,6 +2012,31 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, return map_queues_cpsch(dqm); } +static int wait_on_destroy_queue(struct device_queue_manager *dqm, + struct queue *q) +{ + struct kfd_process_device *pdd = kfd_get_process_device_data(q->device, + q->process); + int ret = 0; + + if (pdd->qpd.is_debug) + return ret; + + q->properties.is_being_destroyed = true; + + if (pdd->process->debug_trap_enabled && q->properties.is_suspended) { + dqm_unlock(dqm); + mutex_unlock(&q->process->mutex); + ret = wait_event_interruptible(dqm->destroy_wait, + !q->properties.is_suspended); + + mutex_lock(&q->process->mutex); + dqm_lock(dqm); + } + + return ret; +} + static int destroy_queue_cpsch(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -1945,11 +2056,16 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, q->properties.queue_id); } - retval = 0; - /* remove queue from list to prevent rescheduling after preemption */ dqm_lock(dqm); + retval = wait_on_destroy_queue(dqm, q); + + if (retval) { + dqm_unlock(dqm); + return retval; + } + if (qpd->is_debug) { /* * error, currently we do not allow to destroy a queue @@ -1996,7 +2112,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, dqm_unlock(dqm); - /* Do free_mqd after dqm_unlock(dqm) to avoid circular locking */ + /* + * Do free_mqd and raise delete event after dqm_unlock(dqm) to avoid + * circular locking + */ + kfd_dbg_ev_raise(KFD_EC_MASK(EC_DEVICE_QUEUE_DELETE), + qpd->pqm->process, q->device, + -1, false, NULL, 0); + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); return retval; @@ -2461,8 +2584,10 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev) goto out_free; } - if (!dqm->ops.initialize(dqm)) + if (!dqm->ops.initialize(dqm)) { + init_waitqueue_head(&dqm->destroy_wait); return dqm; + } out_free: kfree(dqm); @@ -2602,6 +2727,320 @@ out_unlock: return r; } +#define QUEUE_NOT_FOUND -1 +/* invalidate queue operation in array */ +static void q_array_invalidate(uint32_t num_queues, uint32_t *queue_ids) +{ + int i; + + for (i = 0; i < num_queues; i++) + queue_ids[i] |= KFD_DBG_QUEUE_INVALID_MASK; +} + +/* find queue index in array */ +static int q_array_get_index(unsigned int queue_id, + uint32_t num_queues, + uint32_t *queue_ids) +{ + int i; + + for (i = 0; i < num_queues; i++) + if (queue_id == (queue_ids[i] & ~KFD_DBG_QUEUE_INVALID_MASK)) + return i; + + return QUEUE_NOT_FOUND; +} + +struct copy_context_work_handler_workarea { + struct work_struct copy_context_work; + struct kfd_process *p; +}; + +static void copy_context_work_handler (struct work_struct *work) +{ + struct copy_context_work_handler_workarea *workarea; + struct mqd_manager *mqd_mgr; + struct queue *q; + struct mm_struct *mm; + struct kfd_process *p; + uint32_t tmp_ctl_stack_used_size, tmp_save_area_used_size; + int i; + + workarea = container_of(work, + struct copy_context_work_handler_workarea, + copy_context_work); + + p = workarea->p; + mm = get_task_mm(p->lead_thread); + + if (!mm) + return; + + kthread_use_mm(mm); + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + struct device_queue_manager *dqm = pdd->dev->dqm; + struct qcm_process_device *qpd = &pdd->qpd; + + list_for_each_entry(q, &qpd->queues_list, list) { + mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_CP]; + + /* We ignore the return value from get_wave_state + * because + * i) right now, it always returns 0, and + * ii) if we hit an error, we would continue to the + * next queue anyway. + */ + mqd_mgr->get_wave_state(mqd_mgr, + q->mqd, + &q->properties, + (void __user *) q->properties.ctx_save_restore_area_address, + &tmp_ctl_stack_used_size, + &tmp_save_area_used_size); + } + } + kthread_unuse_mm(mm); + mmput(mm); +} + +static uint32_t *get_queue_ids(uint32_t num_queues, uint32_t *usr_queue_id_array) +{ + size_t array_size = num_queues * sizeof(uint32_t); + uint32_t *queue_ids = NULL; + + if (!usr_queue_id_array) + return NULL; + + queue_ids = kzalloc(array_size, GFP_KERNEL); + if (!queue_ids) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(queue_ids, usr_queue_id_array, array_size)) + return ERR_PTR(-EFAULT); + + return queue_ids; +} + +int resume_queues(struct kfd_process *p, + uint32_t num_queues, + uint32_t *usr_queue_id_array) +{ + uint32_t *queue_ids = NULL; + int total_resumed = 0; + int i; + + if (usr_queue_id_array) { + queue_ids = get_queue_ids(num_queues, usr_queue_id_array); + + if (IS_ERR(queue_ids)) + return PTR_ERR(queue_ids); + + /* mask all queues as invalid. unmask per successful request */ + q_array_invalidate(num_queues, queue_ids); + } + + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + struct device_queue_manager *dqm = pdd->dev->dqm; + struct qcm_process_device *qpd = &pdd->qpd; + struct queue *q; + int r, per_device_resumed = 0; + + dqm_lock(dqm); + + /* unmask queues that resume or already resumed as valid */ + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = QUEUE_NOT_FOUND; + + if (queue_ids) + q_idx = q_array_get_index( + q->properties.queue_id, + num_queues, + queue_ids); + + if (!queue_ids || q_idx != QUEUE_NOT_FOUND) { + int err = resume_single_queue(dqm, &pdd->qpd, q); + + if (queue_ids) { + if (!err) { + queue_ids[q_idx] &= + ~KFD_DBG_QUEUE_INVALID_MASK; + } else { + queue_ids[q_idx] |= + KFD_DBG_QUEUE_ERROR_MASK; + break; + } + } + + if (dqm->dev->kfd->shared_resources.enable_mes) { + wake_up_all(&dqm->destroy_wait); + if (!err) + total_resumed++; + } else { + per_device_resumed++; + } + } + } + + if (!per_device_resumed) { + dqm_unlock(dqm); + continue; + } + + r = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, + 0, + USE_DEFAULT_GRACE_PERIOD); + if (r) { + pr_err("Failed to resume process queues\n"); + if (queue_ids) { + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = q_array_get_index( + q->properties.queue_id, + num_queues, + queue_ids); + + /* mask queue as error on resume fail */ + if (q_idx != QUEUE_NOT_FOUND) + queue_ids[q_idx] |= + KFD_DBG_QUEUE_ERROR_MASK; + } + } + } else { + wake_up_all(&dqm->destroy_wait); + total_resumed += per_device_resumed; + } + + dqm_unlock(dqm); + } + + if (queue_ids) { + if (copy_to_user((void __user *)usr_queue_id_array, queue_ids, + num_queues * sizeof(uint32_t))) + pr_err("copy_to_user failed on queue resume\n"); + + kfree(queue_ids); + } + + return total_resumed; +} + +int suspend_queues(struct kfd_process *p, + uint32_t num_queues, + uint32_t grace_period, + uint64_t exception_clear_mask, + uint32_t *usr_queue_id_array) +{ + uint32_t *queue_ids = get_queue_ids(num_queues, usr_queue_id_array); + int total_suspended = 0; + int i; + + if (IS_ERR(queue_ids)) + return PTR_ERR(queue_ids); + + /* mask all queues as invalid. umask on successful request */ + q_array_invalidate(num_queues, queue_ids); + + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + struct device_queue_manager *dqm = pdd->dev->dqm; + struct qcm_process_device *qpd = &pdd->qpd; + struct queue *q; + int r, per_device_suspended = 0; + + mutex_lock(&p->event_mutex); + dqm_lock(dqm); + + /* unmask queues that suspend or already suspended */ + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = q_array_get_index(q->properties.queue_id, + num_queues, + queue_ids); + + if (q_idx != QUEUE_NOT_FOUND) { + int err = suspend_single_queue(dqm, pdd, q); + bool is_mes = dqm->dev->kfd->shared_resources.enable_mes; + + if (!err) { + queue_ids[q_idx] &= ~KFD_DBG_QUEUE_INVALID_MASK; + if (exception_clear_mask && is_mes) + q->properties.exception_status &= + ~exception_clear_mask; + + if (is_mes) + total_suspended++; + else + per_device_suspended++; + } else if (err != -EBUSY) { + r = err; + queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; + break; + } + } + } + + if (!per_device_suspended) { + dqm_unlock(dqm); + mutex_unlock(&p->event_mutex); + if (total_suspended) + amdgpu_amdkfd_debug_mem_fence(dqm->dev->adev); + continue; + } + + r = execute_queues_cpsch(dqm, + KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, + grace_period); + + if (r) + pr_err("Failed to suspend process queues.\n"); + else + total_suspended += per_device_suspended; + + list_for_each_entry(q, &qpd->queues_list, list) { + int q_idx = q_array_get_index(q->properties.queue_id, + num_queues, queue_ids); + + if (q_idx == QUEUE_NOT_FOUND) + continue; + + /* mask queue as error on suspend fail */ + if (r) + queue_ids[q_idx] |= KFD_DBG_QUEUE_ERROR_MASK; + else if (exception_clear_mask) + q->properties.exception_status &= + ~exception_clear_mask; + } + + dqm_unlock(dqm); + mutex_unlock(&p->event_mutex); + amdgpu_device_flush_hdp(dqm->dev->adev, NULL); + } + + if (total_suspended) { + struct copy_context_work_handler_workarea copy_context_worker; + + INIT_WORK_ONSTACK( + ©_context_worker.copy_context_work, + copy_context_work_handler); + + copy_context_worker.p = p; + + schedule_work(©_context_worker.copy_context_work); + + + flush_work(©_context_worker.copy_context_work); + destroy_work_on_stack(©_context_worker.copy_context_work); + } + + if (copy_to_user((void __user *)usr_queue_id_array, queue_ids, + num_queues * sizeof(uint32_t))) + pr_err("copy_to_user failed on queue suspend\n"); + + kfree(queue_ids); + + return total_suspended; +} + int debug_lock_and_unmap(struct device_queue_manager *dqm) { int r; |