Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c')
-rw-r--r--   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c   274
1 file changed, 204 insertions(+), 70 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 045a229436a0..b1a6eb349bb3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
 /*
- * Copyright 2014 Advanced Micro Devices, Inc.
+ * Copyright 2014-2022 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -27,6 +28,11 @@
 #include "kfd_kernel_queue.h"
 #include "kfd_priv.h"
 
+#define OVER_SUBSCRIPTION_PROCESS_COUNT (1 << 0)
+#define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1)
+#define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT (1 << 2)
+#define OVER_SUBSCRIPTION_XNACK_CONFLICT (1 << 3)
+
 static inline void inc_wptr(unsigned int *wptr, unsigned int increment_bytes,
                 unsigned int buffer_size_bytes)
 {
@@ -39,32 +45,41 @@
 static void pm_calc_rlib_size(struct packet_manager *pm,
                 unsigned int *rlib_size,
-                bool *over_subscription)
+                int *over_subscription,
+                int xnack_conflict)
 {
-        unsigned int process_count, queue_count, compute_queue_count;
+        unsigned int process_count, queue_count, compute_queue_count, gws_queue_count;
         unsigned int map_queue_size;
         unsigned int max_proc_per_quantum = 1;
-        struct kfd_dev *dev = pm->dqm->dev;
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
 
         process_count = pm->dqm->processes_count;
-        queue_count = pm->dqm->queue_count;
-        compute_queue_count = queue_count - pm->dqm->sdma_queue_count;
+        queue_count = pm->dqm->active_queue_count;
+        compute_queue_count = pm->dqm->active_cp_queue_count;
+        gws_queue_count = pm->dqm->gws_queue_count;
 
         /* check if there is over subscription
          * Note: the arbitration between the number of VMIDs and
          * hws_max_conc_proc has been done in
          * kgd2kfd_device_init().
          */
-        *over_subscription = false;
+        *over_subscription = 0;
 
-        if (dev->max_proc_per_quantum > 1)
-                max_proc_per_quantum = dev->max_proc_per_quantum;
+        if (node->max_proc_per_quantum > 1)
+                max_proc_per_quantum = node->max_proc_per_quantum;
 
-        if ((process_count > max_proc_per_quantum) ||
-            compute_queue_count > get_queues_num(pm->dqm)) {
-                *over_subscription = true;
-                pr_debug("Over subscribed runlist\n");
-        }
+        if (process_count > max_proc_per_quantum)
+                *over_subscription |= OVER_SUBSCRIPTION_PROCESS_COUNT;
+        if (compute_queue_count > get_cp_queues_num(pm->dqm))
+                *over_subscription |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT;
+        if (gws_queue_count > 1)
+                *over_subscription |= OVER_SUBSCRIPTION_GWS_QUEUE_COUNT;
+        if (xnack_conflict && (node->adev->gmc.xnack_flags & AMDGPU_GMC_XNACK_FLAG_CHAIN))
+                *over_subscription |= OVER_SUBSCRIPTION_XNACK_CONFLICT;
+
+        if (*over_subscription)
+                dev_dbg(dev, "Over subscribed runlist\n");
 
         map_queue_size = pm->pmf->map_queues_size;
         /* calculate run list ib allocation size */
@@ -78,29 +93,32 @@ static void pm_calc_rlib_size(struct packet_manager *pm,
         if (*over_subscription)
                 *rlib_size += pm->pmf->runlist_size;
 
-        pr_debug("runlist ib size %d\n", *rlib_size);
+        dev_dbg(dev, "runlist ib size %d\n", *rlib_size);
 }
 
 static int pm_allocate_runlist_ib(struct packet_manager *pm,
                 unsigned int **rl_buffer,
                 uint64_t *rl_gpu_buffer,
                 unsigned int *rl_buffer_size,
-                bool *is_over_subscription)
+                int *is_over_subscription,
+                int xnack_conflict)
 {
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
         int retval;
 
         if (WARN_ON(pm->allocated))
                 return -EINVAL;
 
-        pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
+        pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription,
+                        xnack_conflict);
 
         mutex_lock(&pm->lock);
 
-        retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
-                                        &pm->ib_buffer_obj);
+        retval = kfd_gtt_sa_allocate(node, *rl_buffer_size, &pm->ib_buffer_obj);
         if (retval) {
-                pr_err("Failed to allocate runlist IB\n");
+                dev_err(dev, "Failed to allocate runlist IB\n");
                 goto out;
         }
@@ -122,32 +140,54 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 {
         unsigned int alloc_size_bytes;
         unsigned int *rl_buffer, rl_wptr, i;
-        int retval, proccesses_mapped;
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
+        int retval, processes_mapped;
         struct device_process_node *cur;
         struct qcm_process_device *qpd;
         struct queue *q;
         struct kernel_queue *kq;
-        bool is_over_subscription;
+        int is_over_subscription;
+        int xnack_enabled = -1;
+        bool xnack_conflict = 0;
+
+        rl_wptr = retval = processes_mapped = 0;
 
-        rl_wptr = retval = proccesses_mapped = 0;
+        /* Check if processes set different xnack modes */
+        list_for_each_entry(cur, queues, list) {
+                qpd = cur->qpd;
+                if (xnack_enabled < 0)
+                        /* First process */
+                        xnack_enabled = qpd->pqm->process->xnack_enabled;
+                else if (qpd->pqm->process->xnack_enabled != xnack_enabled) {
+                        /* Found a process with a different xnack mode */
+                        xnack_conflict = 1;
+                        break;
+                }
+        }
 
         retval = pm_allocate_runlist_ib(pm, &rl_buffer, rl_gpu_addr,
-                                &alloc_size_bytes, &is_over_subscription);
+                                &alloc_size_bytes, &is_over_subscription,
+                                xnack_conflict);
         if (retval)
                 return retval;
 
         *rl_size_bytes = alloc_size_bytes;
         pm->ib_size_bytes = alloc_size_bytes;
 
-        pr_debug("Building runlist ib process count: %d queues count %d\n",
-                pm->dqm->processes_count, pm->dqm->queue_count);
+        dev_dbg(dev, "Building runlist ib process count: %d queues count %d\n",
+                pm->dqm->processes_count, pm->dqm->active_queue_count);
 
+build_runlist_ib:
         /* build the run list ib packet */
         list_for_each_entry(cur, queues, list) {
                 qpd = cur->qpd;
+                /* group processes with the same xnack mode together */
+                if (qpd->pqm->process->xnack_enabled != xnack_enabled)
+                        continue;
                 /* build map process packet */
-                if (proccesses_mapped >= pm->dqm->processes_count) {
-                        pr_debug("Not enough space left in runlist IB\n");
+                if (processes_mapped >= pm->dqm->processes_count) {
+                        dev_dbg(dev, "Not enough space left in runlist IB\n");
                         pm_release_ib(pm);
                         return -ENOMEM;
                 }
@@ -156,7 +196,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
                 if (retval)
                         return retval;
 
-                proccesses_mapped++;
+                processes_mapped++;
                 inc_wptr(&rl_wptr, pm->pmf->map_process_size,
                                 alloc_size_bytes);
 
@@ -164,7 +204,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
                         if (!kq->queue->properties.is_active)
                                 continue;
 
-                        pr_debug("static_queue, mapping kernel q %d, is debug status %d\n",
+                        dev_dbg(dev,
+                                "static_queue, mapping kernel q %d, is debug status %d\n",
                                 kq->queue->queue, qpd->is_debug);
 
                         retval = pm->pmf->map_queues(pm,
@@ -183,7 +224,8 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
                         if (!q->properties.is_active)
                                 continue;
 
-                        pr_debug("static_queue, mapping user queue %d, is debug status %d\n",
+                        dev_dbg(dev,
+                                "static_queue, mapping user queue %d, is debug status %d\n",
                                 q->queue, qpd->is_debug);
 
                         retval = pm->pmf->map_queues(pm,
@@ -199,14 +241,33 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
                                         alloc_size_bytes);
                 }
         }
+        if (xnack_conflict) {
+                /* pick up processes with the other xnack mode */
+                xnack_enabled = !xnack_enabled;
+                xnack_conflict = 0;
+                goto build_runlist_ib;
+        }
 
-        pr_debug("Finished map process and queues to runlist\n");
+        dev_dbg(dev, "Finished map process and queues to runlist\n");
+
+        if (is_over_subscription) {
+                if (!pm->is_over_subscription)
+                        dev_warn(dev, "Runlist is getting oversubscribed due to%s%s%s%s. Expect reduced ROCm performance.\n",
+                                 is_over_subscription & OVER_SUBSCRIPTION_PROCESS_COUNT ?
+                                 " too many processes" : "",
+                                 is_over_subscription & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ?
+                                 " too many queues" : "",
+                                 is_over_subscription & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ?
+                                 " multiple processes using cooperative launch" : "",
+                                 is_over_subscription & OVER_SUBSCRIPTION_XNACK_CONFLICT ?
+                                 " xnack on/off processes mixed on gfx9" : "");
 
-        if (is_over_subscription)
                 retval = pm->pmf->runlist(pm, &rl_buffer[rl_wptr],
                                         *rl_gpu_addr,
                                         alloc_size_bytes / sizeof(uint32_t),
                                         true);
+        }
+        pm->is_over_subscription = !!is_over_subscription;
 
         for (i = 0; i < alloc_size_bytes / sizeof(uint32_t); i++)
                 pr_debug("0x%2X ", rl_buffer[i]);
@@ -217,7 +278,7 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 {
-        switch (dqm->dev->device_info->asic_family) {
+        switch (dqm->dev->adev->asic_type) {
         case CHIP_KAVERI:
         case CHIP_HAWAII:
                 /* PM4 packet structures on CIK are the same as on VI */
@@ -227,18 +288,22 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
         case CHIP_POLARIS10:
         case CHIP_POLARIS11:
         case CHIP_POLARIS12:
+        case CHIP_VEGAM:
                 pm->pmf = &kfd_vi_pm_funcs;
                 break;
-        case CHIP_VEGA10:
-        case CHIP_VEGA12:
-        case CHIP_VEGA20:
-        case CHIP_RAVEN:
-                pm->pmf = &kfd_v9_pm_funcs;
-                break;
         default:
-                WARN(1, "Unexpected ASIC family %u",
-                        dqm->dev->device_info->asic_family);
-                return -EINVAL;
+                if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2) ||
+                    KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 3) ||
+                    KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 4) ||
+                    KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 5, 0))
+                        pm->pmf = &kfd_aldebaran_pm_funcs;
+                else if (KFD_GC_VERSION(dqm->dev) >= IP_VERSION(9, 0, 1))
+                        pm->pmf = &kfd_v9_pm_funcs;
+                else {
+                        WARN(1, "Unexpected ASIC family %u",
+                             dqm->dev->adev->asic_type);
+                        return -EINVAL;
+                }
         }
@@ -257,30 +322,33 @@ void pm_uninit(struct packet_manager *pm)
 {
         mutex_destroy(&pm->lock);
         kernel_queue_uninit(pm->priv_queue);
+        pm->priv_queue = NULL;
 }
 
 int pm_send_set_resources(struct packet_manager *pm,
                 struct scheduling_resources *res)
 {
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
         uint32_t *buffer, size;
         int retval = 0;
 
         size = pm->pmf->set_resources_size;
         mutex_lock(&pm->lock);
-        pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+        kq_acquire_packet_buffer(pm->priv_queue,
                         size / sizeof(uint32_t), (unsigned int **)&buffer);
         if (!buffer) {
-                pr_err("Failed to allocate buffer on kernel queue\n");
+                dev_err(dev, "Failed to allocate buffer on kernel queue\n");
                 retval = -ENOMEM;
                 goto out;
         }
 
         retval = pm->pmf->set_resources(pm, buffer, res);
         if (!retval)
-                pm->priv_queue->ops.submit_packet(pm->priv_queue);
+                retval = kq_submit_packet(pm->priv_queue);
         else
-                pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+                kq_rollback_packet(pm->priv_queue);
 
 out:
         mutex_unlock(&pm->lock);
@@ -305,7 +373,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
         packet_size_dwords = pm->pmf->runlist_size / sizeof(uint32_t);
         mutex_lock(&pm->lock);
 
-        retval = pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+        retval = kq_acquire_packet_buffer(pm->priv_queue,
                                         packet_size_dwords, &rl_buffer);
         if (retval)
                 goto fail_acquire_packet_buffer;
@@ -315,14 +383,14 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
         if (retval)
                 goto fail_create_runlist;
 
-        pm->priv_queue->ops.submit_packet(pm->priv_queue);
+        retval = kq_submit_packet(pm->priv_queue);
 
         mutex_unlock(&pm->lock);
 
         return retval;
 
 fail_create_runlist:
-        pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+        kq_rollback_packet(pm->priv_queue);
 fail_acquire_packet_buffer:
         mutex_unlock(&pm->lock);
 fail_create_runlist_ib:
@@ -331,8 +399,10 @@ fail_create_runlist_ib:
 }
 
 int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
-                        uint32_t fence_value)
+                        uint64_t fence_value)
 {
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
         uint32_t *buffer, size;
         int retval = 0;
 
@@ -341,49 +411,108 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
         size = pm->pmf->query_status_size;
         mutex_lock(&pm->lock);
-        pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+        kq_acquire_packet_buffer(pm->priv_queue,
                         size / sizeof(uint32_t), (unsigned int **)&buffer);
         if (!buffer) {
-                pr_err("Failed to allocate buffer on kernel queue\n");
+                dev_err(dev, "Failed to allocate buffer on kernel queue\n");
                 retval = -ENOMEM;
                 goto out;
         }
 
         retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
         if (!retval)
-                pm->priv_queue->ops.submit_packet(pm->priv_queue);
+                retval = kq_submit_packet(pm->priv_queue);
         else
-                pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+                kq_rollback_packet(pm->priv_queue);
 
 out:
         mutex_unlock(&pm->lock);
         return retval;
 }
 
-int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
+/* pm_config_dequeue_wait_counts: Configure dequeue timer Wait Counts
+ * by writing to CP_IQ_WAIT_TIME2 registers.
+ *
+ * @cmd: See emum kfd_config_dequeue_wait_counts_cmd definition
+ * @value: Depends on the cmd. This parameter is unused for
+ *      KFD_DEQUEUE_WAIT_INIT and KFD_DEQUEUE_WAIT_RESET. For
+ *      KFD_DEQUEUE_WAIT_SET_SCH_WAVE it holds value to be set
+ *
+ */
+int pm_config_dequeue_wait_counts(struct packet_manager *pm,
+                enum kfd_config_dequeue_wait_counts_cmd cmd,
+                uint32_t value)
+{
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
+        int retval = 0;
+        uint32_t *buffer, size;
+
+        if (!pm->pmf->config_dequeue_wait_counts ||
+            !pm->pmf->config_dequeue_wait_counts_size)
+                return 0;
+
+        if (cmd == KFD_DEQUEUE_WAIT_INIT && (KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(9, 4, 1) ||
+            KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(10, 0, 0)))
+                return 0;
+
+        size = pm->pmf->config_dequeue_wait_counts_size;
+
+        mutex_lock(&pm->lock);
+
+        if (size) {
+                kq_acquire_packet_buffer(pm->priv_queue,
+                                         size / sizeof(uint32_t),
+                                         (unsigned int **)&buffer);
+
+                if (!buffer) {
+                        dev_err(dev,
+                                "Failed to allocate buffer on kernel queue\n");
+                        retval = -ENOMEM;
+                        goto out;
+                }
+
+                retval = pm->pmf->config_dequeue_wait_counts(pm, buffer,
+                                                             cmd, value);
+                if (!retval) {
+                        retval = kq_submit_packet(pm->priv_queue);
+
+                        /* If default value is modified, cache that in dqm->wait_times */
+                        if (!retval && cmd == KFD_DEQUEUE_WAIT_INIT)
+                                update_dqm_wait_times(pm->dqm);
+                } else {
+                        kq_rollback_packet(pm->priv_queue);
+                }
+        }
+out:
+        mutex_unlock(&pm->lock);
+        return retval;
+}
+
+int pm_send_unmap_queue(struct packet_manager *pm,
                         enum kfd_unmap_queues_filter filter,
-                        uint32_t filter_param, bool reset,
-                        unsigned int sdma_engine)
+                        uint32_t filter_param, bool reset)
 {
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
         uint32_t *buffer, size;
         int retval = 0;
 
         size = pm->pmf->unmap_queues_size;
         mutex_lock(&pm->lock);
-        pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+        kq_acquire_packet_buffer(pm->priv_queue,
                         size / sizeof(uint32_t), (unsigned int **)&buffer);
         if (!buffer) {
-                pr_err("Failed to allocate buffer on kernel queue\n");
+                dev_err(dev, "Failed to allocate buffer on kernel queue\n");
                 retval = -ENOMEM;
                 goto out;
         }
 
-        retval = pm->pmf->unmap_queues(pm, buffer, type, filter, filter_param,
-                        reset, sdma_engine);
+        retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
         if (!retval)
-                pm->priv_queue->ops.submit_packet(pm->priv_queue);
+                retval = kq_submit_packet(pm->priv_queue);
         else
-                pm->priv_queue->ops.rollback_packet(pm->priv_queue);
+                kq_rollback_packet(pm->priv_queue);
 
 out:
         mutex_unlock(&pm->lock);
@@ -423,24 +552,29 @@
 
 int pm_debugfs_hang_hws(struct packet_manager *pm)
 {
+        struct kfd_node *node = pm->dqm->dev;
+        struct device *dev = node->adev->dev;
         uint32_t *buffer, size;
         int r = 0;
 
+        if (!pm->priv_queue)
+                return -EAGAIN;
+
         size = pm->pmf->query_status_size;
         mutex_lock(&pm->lock);
-        pm->priv_queue->ops.acquire_packet_buffer(pm->priv_queue,
+        kq_acquire_packet_buffer(pm->priv_queue,
                         size / sizeof(uint32_t), (unsigned int **)&buffer);
         if (!buffer) {
-                pr_err("Failed to allocate buffer on kernel queue\n");
+                dev_err(dev, "Failed to allocate buffer on kernel queue\n");
                 r = -ENOMEM;
                 goto out;
         }
         memset(buffer, 0x55, size);
-        pm->priv_queue->ops.submit_packet(pm->priv_queue);
+        kq_submit_packet(pm->priv_queue);
 
-        pr_info("Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.",
-                buffer[0], buffer[1], buffer[2], buffer[3],
-                buffer[4], buffer[5], buffer[6]);
+        dev_info(dev, "Submitting %x %x %x %x %x %x %x to HIQ to hang the HWS.",
+                 buffer[0], buffer[1], buffer[2], buffer[3], buffer[4],
+                 buffer[5], buffer[6]);
 out:
         mutex_unlock(&pm->lock);
         return r;
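The patch replaces the old boolean over-subscription flag with a bitmask so the one-time dev_warn() can name every contributing cause. Below is a minimal stand-alone sketch of that reporting pattern; the OVER_SUBSCRIPTION_* names mirror the patch, while report_oversubscription() and the sample inputs are hypothetical stand-ins for the device-queue-manager state.

/* Stand-alone sketch of the bitmask reporting pattern used above. */
#include <stdio.h>

#define OVER_SUBSCRIPTION_PROCESS_COUNT       (1 << 0)
#define OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT (1 << 1)
#define OVER_SUBSCRIPTION_GWS_QUEUE_COUNT     (1 << 2)
#define OVER_SUBSCRIPTION_XNACK_CONFLICT      (1 << 3)

static void report_oversubscription(int mask)
{
        if (!mask)
                return;

        /* Each cause contributes a fragment only when its bit is set; the
         * dev_warn() in the patch builds its message the same way. */
        printf("Runlist is getting oversubscribed due to%s%s%s%s.\n",
               mask & OVER_SUBSCRIPTION_PROCESS_COUNT ? " too many processes" : "",
               mask & OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT ? " too many queues" : "",
               mask & OVER_SUBSCRIPTION_GWS_QUEUE_COUNT ?
                        " multiple processes using cooperative launch" : "",
               mask & OVER_SUBSCRIPTION_XNACK_CONFLICT ?
                        " xnack on/off processes mixed on gfx9" : "");
}

int main(void)
{
        /* Hypothetical inputs standing in for dqm counters. */
        int process_count = 9, max_proc_per_quantum = 8;
        int compute_queue_count = 40, cp_queues_num = 32;
        int mask = 0;

        if (process_count > max_proc_per_quantum)
                mask |= OVER_SUBSCRIPTION_PROCESS_COUNT;
        if (compute_queue_count > cp_queues_num)
                mask |= OVER_SUBSCRIPTION_COMPUTE_QUEUE_COUNT;

        report_oversubscription(mask);
        return 0;
}

Because each ternary expands to an empty string when its bit is clear, the warning stays a single format string no matter how many causes are active, and the cached pm->is_over_subscription keeps it from being repeated on every runlist rebuild.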
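The build_runlist_ib label together with the xnack_conflict flag implements a two-pass build: processes sharing the first-seen XNACK mode are mapped first, then the loop runs once more for the opposite mode so the two groups never interleave within the runlist. A small user-space sketch of that grouping, assuming a plain array and a hypothetical emit_process() in place of the driver's process list and PM4 packet writers:

/* Sketch of the two-pass grouping used by build_runlist_ib above. */
#include <stdbool.h>
#include <stdio.h>

struct proc { int pasid; bool xnack_enabled; };

static void emit_process(const struct proc *p)
{
        printf("map process pasid %d (xnack %d)\n", p->pasid, p->xnack_enabled);
}

static void build_runlist(const struct proc *procs, int n)
{
        int xnack_enabled = -1;         /* mode of the first process seen */
        bool xnack_conflict = false;
        int i;

        /* First pass over the list: detect whether two xnack modes coexist. */
        for (i = 0; i < n; i++) {
                if (xnack_enabled < 0)
                        xnack_enabled = procs[i].xnack_enabled;
                else if (procs[i].xnack_enabled != (bool)xnack_enabled) {
                        xnack_conflict = true;
                        break;
                }
        }

build_runlist_ib:
        /* Emit only processes whose mode matches the current pass. */
        for (i = 0; i < n; i++) {
                if (procs[i].xnack_enabled != (bool)xnack_enabled)
                        continue;
                emit_process(&procs[i]);
        }
        if (xnack_conflict) {
                xnack_enabled = !xnack_enabled; /* second pass: other mode */
                xnack_conflict = false;
                goto build_runlist_ib;
        }
}

int main(void)
{
        struct proc procs[] = { {1, true}, {2, false}, {3, true} };

        build_runlist(procs, 3);
        return 0;
}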
