Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  3117
1 file changed, 2056 insertions, 1061 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index be1ab43473c6..b1c24c8fa686 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: MIT
/*
* Copyright 2014-2018 Advanced Micro Devices, Inc.
*
@@ -19,30 +20,37 @@
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
-
-#define pr_fmt(fmt) "kfd2kgd: " fmt
-
+#include <linux/dma-buf.h>
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
-#include <linux/dma-buf.h>
-#include <drm/drmP.h>
+#include <linux/sched/task.h>
+#include <drm/ttm/ttm_tt.h>
+
+#include <drm/drm_exec.h>
+
#include "amdgpu_object.h"
+#include "amdgpu_gem.h"
#include "amdgpu_vm.h"
+#include "amdgpu_hmm.h"
#include "amdgpu_amdkfd.h"
-
-/* Special VM and GART address alignment needed for VI pre-Fiji due to
- * a HW bug.
- */
-#define VI_BO_SIZE_ALIGN (0x8000)
-
-/* BO flag to indicate a KFD userptr BO */
-#define AMDGPU_AMDKFD_USERPTR_BO (1ULL << 63)
+#include "amdgpu_dma_buf.h"
+#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_xgmi.h"
+#include "kfd_priv.h"
+#include "kfd_smi_events.h"
/* Userptr restore delay, just long enough to allow consecutive VM
* changes to accumulate
*/
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1
+#define AMDGPU_RESERVE_MEM_LIMIT (3UL << 29)
+
+/*
+ * Align VRAM availability to 2MB to avoid fragmentation caused by 4K allocations in the tail 2MB
+ * BO chunk
+ */
+#define VRAM_AVAILABLITY_ALIGN (1 << 21)
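+/*
+ * Illustrative accounting example: with this alignment, even a single 4 KiB
+ * VRAM allocation is charged as a full 2 MiB against vram_used_aligned
+ * (ALIGN(4096, VRAM_AVAILABLITY_ALIGN) == 2 MiB in the non-APU accounting path).
+ */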
/* Impose limit on how much memory KFD can use */
static struct {
@@ -53,12 +61,6 @@ static struct {
spinlock_t mem_limit_lock;
} kfd_mem_limit;
-/* Struct used for amdgpu_amdkfd_bo_validate */
-struct amdgpu_vm_parser {
- uint32_t domain;
- bool wait;
-};
-
static const char * const domain_bit_to_string[] = {
"CPU",
"GTT",
@@ -72,26 +74,39 @@ static const char * const domain_bit_to_string[] = {
static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work);
-
-static inline struct amdgpu_device *get_amdgpu_device(struct kgd_dev *kgd)
-{
- return (struct amdgpu_device *)kgd;
-}
-
-static bool check_if_add_bo_to_vm(struct amdgpu_vm *avm,
+static bool kfd_mem_is_attached(struct amdgpu_vm *avm,
struct kgd_mem *mem)
{
- struct kfd_bo_va_list *entry;
+ struct kfd_mem_attachment *entry;
- list_for_each_entry(entry, &mem->bo_va_list, bo_list)
+ list_for_each_entry(entry, &mem->attachments, list)
if (entry->bo_va->base.vm == avm)
- return false;
+ return true;
- return true;
+ return false;
+}
+
+/**
+ * reuse_dmamap() - Check whether adev can share the original
+ * userptr BO
+ *
+ * If both adev and bo_adev are in direct mapping or
+ * in the same iommu group, they can share the original BO.
+ *
+ * @adev: Device that may or may not be able to share the original BO
+ * @bo_adev: Device to which the allocated BO belongs
+ *
+ * Return: true if adev can share the original userptr BO,
+ * false otherwise.
+ */
+static bool reuse_dmamap(struct amdgpu_device *adev, struct amdgpu_device *bo_adev)
+{
+ return (adev->ram_is_direct_mapped && bo_adev->ram_is_direct_mapped) ||
+ (adev->dev->iommu_group == bo_adev->dev->iommu_group);
}
/* Set memory usage limits. Currently, limits are
- * System (TTM + userptr) memory - 3/4th System RAM
+ * System (TTM + userptr) memory - 15/16th System RAM
* TTM memory - 3/8th System RAM
*/
void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
@@ -99,236 +114,304 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
struct sysinfo si;
uint64_t mem;
+ if (kfd_mem_limit.max_system_mem_limit)
+ return;
+
si_meminfo(&si);
mem = si.totalram - si.totalhigh;
mem *= si.mem_unit;
spin_lock_init(&kfd_mem_limit.mem_limit_lock);
- kfd_mem_limit.max_system_mem_limit = (mem >> 1) + (mem >> 2);
- kfd_mem_limit.max_ttm_mem_limit = (mem >> 1) - (mem >> 3);
+ kfd_mem_limit.max_system_mem_limit = mem - (mem >> 6);
+ if (kfd_mem_limit.max_system_mem_limit < 2 * AMDGPU_RESERVE_MEM_LIMIT)
+ kfd_mem_limit.max_system_mem_limit >>= 1;
+ else
+ kfd_mem_limit.max_system_mem_limit -= AMDGPU_RESERVE_MEM_LIMIT;
+
+ kfd_mem_limit.max_ttm_mem_limit = ttm_tt_pages_limit() << PAGE_SHIFT;
pr_debug("Kernel memory limit %lluM, TTM limit %lluM\n",
(kfd_mem_limit.max_system_mem_limit >> 20),
(kfd_mem_limit.max_ttm_mem_limit >> 20));
}
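+/*
+ * Worked example (assuming a hypothetical 64 GiB system): mem - (mem >> 6)
+ * leaves 63 GiB; since that is well above 2 * AMDGPU_RESERVE_MEM_LIMIT
+ * (2 * 1.5 GiB), the final system limit is 63 GiB - 1.5 GiB = 61.5 GiB,
+ * while the TTM limit follows ttm_tt_pages_limit().
+ */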
-static int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
- uint64_t size, u32 domain, bool sg)
+void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
{
- size_t acc_size, system_mem_needed, ttm_mem_needed, vram_needed;
- uint64_t reserved_for_pt = amdgpu_amdkfd_total_mem_size >> 9;
- int ret = 0;
+ kfd_mem_limit.system_mem_used += size;
+}
+
+/* Estimate page table size needed to represent a given memory size
+ *
+ * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
+ * (factor 512, >> 9). With 2MB pages, we need one 8 byte PTE for 2MB
+ * of memory (factor 256K, >> 18). ROCm user mode tries to optimize
+ * for 2MB pages for TLB efficiency. However, small allocations and
+ * fragmented system memory still need some 4KB pages. We choose a
+ * compromise that should work in most cases without reserving too
+ * much memory for page tables unnecessarily (factor 16K, >> 14).
+ */
- acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
- sizeof(struct amdgpu_bo));
+#define ESTIMATE_PT_SIZE(mem_size) max(((mem_size) >> 14), AMDGPU_VM_RESERVED_VRAM)
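+/*
+ * Illustrative arithmetic: for a hypothetical 64 GiB allocation,
+ * ESTIMATE_PT_SIZE(64ULL << 30) is (64ULL << 30) >> 14 == 4 MiB of
+ * page-table reservation; the macro never returns less than
+ * AMDGPU_VM_RESERVED_VRAM.
+ */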
+
+/**
+ * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size
+ * of buffer.
+ *
+ * @adev: Device to which allocated BO belongs to
+ * @size: Size of buffer, in bytes, encapsulated by BO. This should be
+ * equivalent to amdgpu_bo_size(BO)
+ * @alloc_flag: Flag used in allocating a BO as noted above
+ * @xcp_id: ID used to look up the XCP in the XCP manager; one XCP is
+ * managed as one compute node in the driver for the application
+ *
+ * Return:
+ * -ENOMEM in case of error, zero otherwise
+ */
+int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
+ uint64_t size, u32 alloc_flag, int8_t xcp_id)
+{
+ uint64_t reserved_for_pt =
+ ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
+ size_t system_mem_needed, ttm_mem_needed, vram_needed;
+ int ret = 0;
+ uint64_t vram_size = 0;
+ system_mem_needed = 0;
+ ttm_mem_needed = 0;
vram_needed = 0;
- if (domain == AMDGPU_GEM_DOMAIN_GTT) {
- /* TTM GTT memory */
- system_mem_needed = acc_size + size;
- ttm_mem_needed = acc_size + size;
- } else if (domain == AMDGPU_GEM_DOMAIN_CPU && !sg) {
- /* Userptr */
- system_mem_needed = acc_size + size;
- ttm_mem_needed = acc_size;
- } else {
- /* VRAM and SG */
- system_mem_needed = acc_size;
- ttm_mem_needed = acc_size;
- if (domain == AMDGPU_GEM_DOMAIN_VRAM)
- vram_needed = size;
+ if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
+ system_mem_needed = size;
+ ttm_mem_needed = size;
+ } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+ /*
+ * Conservatively round up the allocation requirement to 2 MB
+ * to avoid fragmentation caused by 4K allocations in the tail
+ * 2M BO chunk.
+ */
+ vram_needed = size;
+ /*
+ * For GFX 9.4.3, get the VRAM size from XCP structs
+ */
+ if (WARN_ONCE(xcp_id < 0, "invalid XCP ID %d", xcp_id))
+ return -EINVAL;
+
+ vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
+ if (adev->apu_prefer_gtt) {
+ system_mem_needed = size;
+ ttm_mem_needed = size;
+ }
+ } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+ system_mem_needed = size;
+ } else if (!(alloc_flag &
+ (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
+ pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
+ return -ENOMEM;
}
spin_lock(&kfd_mem_limit.mem_limit_lock);
- if ((kfd_mem_limit.system_mem_used + system_mem_needed >
- kfd_mem_limit.max_system_mem_limit) ||
- (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
- kfd_mem_limit.max_ttm_mem_limit) ||
- (adev->kfd.vram_used + vram_needed >
- adev->gmc.real_vram_size - reserved_for_pt)) {
+ if (kfd_mem_limit.system_mem_used + system_mem_needed >
+ kfd_mem_limit.max_system_mem_limit) {
+ pr_debug("Set no_system_mem_limit=1 if using shared memory\n");
+ if (!no_system_mem_limit) {
+ ret = -ENOMEM;
+ goto release;
+ }
+ }
+
+ if (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
+ kfd_mem_limit.max_ttm_mem_limit) {
ret = -ENOMEM;
- } else {
- kfd_mem_limit.system_mem_used += system_mem_needed;
- kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
- adev->kfd.vram_used += vram_needed;
+ goto release;
}
+	/* If is_app_apu is false and apu_prefer_gtt is true, this is an APU with
+	 * carveout < GTT. In that case, VRAM allocations go to the GTT domain; skip
+	 * the VRAM check since the ttm_mem_limit check already covers this allocation.
+ */
+
+ if (adev && xcp_id >= 0 && (!adev->apu_prefer_gtt || adev->gmc.is_app_apu)) {
+ uint64_t vram_available =
+ vram_size - reserved_for_pt - reserved_for_ras -
+ atomic64_read(&adev->vram_pin_size);
+ if (adev->kfd.vram_used[xcp_id] + vram_needed > vram_available) {
+ ret = -ENOMEM;
+ goto release;
+ }
+ }
+
+ /* Update memory accounting by decreasing available system
+ * memory, TTM memory and GPU memory as computed above
+ */
+ WARN_ONCE(vram_needed && !adev,
+ "adev reference can't be null when vram is used");
+ if (adev && xcp_id >= 0) {
+ adev->kfd.vram_used[xcp_id] += vram_needed;
+ adev->kfd.vram_used_aligned[xcp_id] +=
+ adev->apu_prefer_gtt ?
+ vram_needed :
+ ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
+ }
+ kfd_mem_limit.system_mem_used += system_mem_needed;
+ kfd_mem_limit.ttm_mem_used += ttm_mem_needed;
+
+release:
spin_unlock(&kfd_mem_limit.mem_limit_lock);
return ret;
}
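+/*
+ * Informal sketch of the expected pairing: a successful
+ * amdgpu_amdkfd_reserve_mem_limit() call is later balanced by
+ * amdgpu_amdkfd_unreserve_mem_limit() with the same size and flags,
+ * e.g. from amdgpu_amdkfd_release_notify() when the BO is freed; the
+ * WARN_ONCE checks in the unreserve path flag unbalanced accounting.
+ */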
-static void unreserve_mem_limit(struct amdgpu_device *adev,
- uint64_t size, u32 domain, bool sg)
+void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
+ uint64_t size, u32 alloc_flag, int8_t xcp_id)
{
- size_t acc_size;
-
- acc_size = ttm_bo_dma_acc_size(&adev->mman.bdev, size,
- sizeof(struct amdgpu_bo));
-
spin_lock(&kfd_mem_limit.mem_limit_lock);
- if (domain == AMDGPU_GEM_DOMAIN_GTT) {
- kfd_mem_limit.system_mem_used -= (acc_size + size);
- kfd_mem_limit.ttm_mem_used -= (acc_size + size);
- } else if (domain == AMDGPU_GEM_DOMAIN_CPU && !sg) {
- kfd_mem_limit.system_mem_used -= (acc_size + size);
- kfd_mem_limit.ttm_mem_used -= acc_size;
- } else {
- kfd_mem_limit.system_mem_used -= acc_size;
- kfd_mem_limit.ttm_mem_used -= acc_size;
- if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
- adev->kfd.vram_used -= size;
- WARN_ONCE(adev->kfd.vram_used < 0,
- "kfd VRAM memory accounting unbalanced");
+
+ if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
+ kfd_mem_limit.system_mem_used -= size;
+ kfd_mem_limit.ttm_mem_used -= size;
+ } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+ WARN_ONCE(!adev,
+ "adev reference can't be null when alloc mem flags vram is set");
+ if (WARN_ONCE(xcp_id < 0, "invalid XCP ID %d", xcp_id))
+ goto release;
+
+ if (adev) {
+ adev->kfd.vram_used[xcp_id] -= size;
+ if (adev->apu_prefer_gtt) {
+ adev->kfd.vram_used_aligned[xcp_id] -= size;
+ kfd_mem_limit.system_mem_used -= size;
+ kfd_mem_limit.ttm_mem_used -= size;
+ } else {
+ adev->kfd.vram_used_aligned[xcp_id] -=
+ ALIGN(size, VRAM_AVAILABLITY_ALIGN);
+ }
}
- }
- WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
- "kfd system memory accounting unbalanced");
+ } else if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+ kfd_mem_limit.system_mem_used -= size;
+ } else if (!(alloc_flag &
+ (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
+ pr_err("%s: Invalid BO type %#x\n", __func__, alloc_flag);
+ goto release;
+ }
+ WARN_ONCE(adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] < 0,
+ "KFD VRAM memory accounting unbalanced for xcp: %d", xcp_id);
WARN_ONCE(kfd_mem_limit.ttm_mem_used < 0,
- "kfd TTM memory accounting unbalanced");
+ "KFD TTM memory accounting unbalanced");
+ WARN_ONCE(kfd_mem_limit.system_mem_used < 0,
+ "KFD system memory accounting unbalanced");
+release:
spin_unlock(&kfd_mem_limit.mem_limit_lock);
}
-void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo)
+void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
{
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
- u32 domain = bo->preferred_domains;
- bool sg = (bo->preferred_domains == AMDGPU_GEM_DOMAIN_CPU);
+ u32 alloc_flags = bo->kfd_bo->alloc_flags;
+ u64 size = amdgpu_bo_size(bo);
- if (bo->flags & AMDGPU_AMDKFD_USERPTR_BO) {
- domain = AMDGPU_GEM_DOMAIN_CPU;
- sg = false;
- }
+ amdgpu_amdkfd_unreserve_mem_limit(adev, size, alloc_flags,
+ bo->xcp_id);
- unreserve_mem_limit(adev, amdgpu_bo_size(bo), domain, sg);
+ kfree(bo->kfd_bo);
}
+/**
+ * create_dmamap_sg_bo() - Creates an amdgpu_bo object to reflect information
+ * about a USERPTR, DOORBELL or MMIO BO.
+ *
+ * @adev: Device for which dmamap BO is being created
+ * @mem: BO of peer device that is being DMA mapped. Provides parameters
+ * in building the dmamap BO
+ * @bo_out: Output parameter updated with handle of dmamap BO
+ */
+static int
+create_dmamap_sg_bo(struct amdgpu_device *adev,
+ struct kgd_mem *mem, struct amdgpu_bo **bo_out)
+{
+ struct drm_gem_object *gem_obj;
+ int ret;
+ uint64_t flags = 0;
+
+ ret = amdgpu_bo_reserve(mem->bo, false);
+ if (ret)
+ return ret;
+
+ if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR)
+ flags |= mem->bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+ AMDGPU_GEM_CREATE_UNCACHED);
+
+ ret = amdgpu_gem_object_create(adev, mem->bo->tbo.base.size, 1,
+ AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_CREATE_PREEMPTIBLE | flags,
+ ttm_bo_type_sg, mem->bo->tbo.base.resv, &gem_obj, 0);
-/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence(s) from BO's
+ amdgpu_bo_unreserve(mem->bo);
+
+ if (ret) {
+ pr_err("Error in creating DMA mappable SG BO on domain: %d\n", ret);
+ return -EINVAL;
+ }
+
+ *bo_out = gem_to_amdgpu_bo(gem_obj);
+ (*bo_out)->parent = amdgpu_bo_ref(mem->bo);
+ return ret;
+}
+
+/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's
* reservation object.
*
* @bo: [IN] Remove eviction fence(s) from this BO
- * @ef: [IN] If ef is specified, then this eviction fence is removed if it
+ * @ef: [IN] This eviction fence is removed if it
* is present in the shared list.
- * @ef_list: [OUT] Returns list of eviction fences. These fences are removed
- * from BO's reservation object shared list.
- * @ef_count: [OUT] Number of fences in ef_list.
*
- * NOTE: If called with ef_list, then amdgpu_amdkfd_add_eviction_fence must be
- * called to restore the eviction fences and to avoid memory leak. This is
- * useful for shared BOs.
* NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
*/
static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
- struct amdgpu_amdkfd_fence *ef,
- struct amdgpu_amdkfd_fence ***ef_list,
- unsigned int *ef_count)
+ struct amdgpu_amdkfd_fence *ef)
{
- struct reservation_object *resv = bo->tbo.resv;
- struct reservation_object_list *old, *new;
- unsigned int i, j, k;
+ struct dma_fence *replacement;
- if (!ef && !ef_list)
+ if (!ef)
return -EINVAL;
- if (ef_list) {
- *ef_list = NULL;
- *ef_count = 0;
- }
-
- old = reservation_object_get_list(resv);
- if (!old)
- return 0;
-
- new = kmalloc(offsetof(typeof(*new), shared[old->shared_max]),
- GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- /* Go through all the shared fences in the resevation object and sort
- * the interesting ones to the end of the list.
+	/* TODO: Instead of blocking here, we should use the fence of the page
+	 * table update and TLB flush directly.
*/
- for (i = 0, j = old->shared_count, k = 0; i < old->shared_count; ++i) {
- struct dma_fence *f;
-
- f = rcu_dereference_protected(old->shared[i],
- reservation_object_held(resv));
-
- if ((ef && f->context == ef->base.context) ||
- (!ef && to_amdgpu_amdkfd_fence(f)))
- RCU_INIT_POINTER(new->shared[--j], f);
- else
- RCU_INIT_POINTER(new->shared[k++], f);
- }
- new->shared_max = old->shared_max;
- new->shared_count = k;
-
- if (!ef) {
- unsigned int count = old->shared_count - j;
-
- /* Alloc memory for count number of eviction fence pointers.
- * Fill the ef_list array and ef_count
- */
- *ef_list = kcalloc(count, sizeof(**ef_list), GFP_KERNEL);
- *ef_count = count;
-
- if (!*ef_list) {
- kfree(new);
- return -ENOMEM;
- }
- }
-
- /* Install the new fence list, seqcount provides the barriers */
- preempt_disable();
- write_seqcount_begin(&resv->seq);
- RCU_INIT_POINTER(resv->fence, new);
- write_seqcount_end(&resv->seq);
- preempt_enable();
-
- /* Drop the references to the removed fences or move them to ef_list */
- for (i = j, k = 0; i < old->shared_count; ++i) {
- struct dma_fence *f;
-
- f = rcu_dereference_protected(new->shared[i],
- reservation_object_held(resv));
- if (!ef)
- (*ef_list)[k++] = to_amdgpu_amdkfd_fence(f);
- else
- dma_fence_put(f);
- }
- kfree_rcu(old, rcu);
-
+ replacement = dma_fence_get_stub();
+ dma_resv_replace_fences(bo->tbo.base.resv, ef->base.context,
+ replacement, DMA_RESV_USAGE_BOOKKEEP);
+ dma_fence_put(replacement);
return 0;
}
-/* amdgpu_amdkfd_add_eviction_fence - Adds eviction fence(s) back into BO's
- * reservation object.
- *
- * @bo: [IN] Add eviction fences to this BO
- * @ef_list: [IN] List of eviction fences to be added
- * @ef_count: [IN] Number of fences in ef_list.
+/**
+ * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences
+ * @bo: the BO to remove the eviction fences from.
*
- * NOTE: Must call amdgpu_amdkfd_remove_eviction_fence before calling this
- * function.
+ * This function should only be used on release when all references to the BO
+ * are already dropped. We remove the eviction fence from the private copy of
+ * the dma_resv object here since that is what is used during release to
+ * determine if the BO is idle or not.
*/
-static void amdgpu_amdkfd_add_eviction_fence(struct amdgpu_bo *bo,
- struct amdgpu_amdkfd_fence **ef_list,
- unsigned int ef_count)
+void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
{
- int i;
+ struct dma_resv *resv = &bo->tbo.base._resv;
+ struct dma_fence *fence, *stub;
+ struct dma_resv_iter cursor;
- if (!ef_list || !ef_count)
- return;
+ dma_resv_assert_held(resv);
- for (i = 0; i < ef_count; i++) {
- amdgpu_bo_fence(bo, &ef_list[i]->base, true);
- /* Re-adding the fence takes an additional reference. Drop that
- * reference.
- */
- dma_fence_put(&ef_list[i]->base);
- }
+ stub = dma_fence_get_stub();
+ dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
+ if (!to_amdgpu_amdkfd_fence(fence))
+ continue;
- kfree(ef_list);
+ dma_resv_replace_fences(resv, fence->context, stub,
+ DMA_RESV_USAGE_BOOKKEEP);
+ }
+ dma_fence_put(stub);
}
static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
@@ -341,33 +424,51 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, uint32_t domain,
"Called with userptr BO"))
return -EINVAL;
+	/* bo has been pinned, no need to validate it */
+ if (bo->tbo.pin_count)
+ return 0;
+
amdgpu_bo_placement_from_domain(bo, domain);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (ret)
goto validate_fail;
- if (wait) {
- struct amdgpu_amdkfd_fence **ef_list;
- unsigned int ef_count;
-
- ret = amdgpu_amdkfd_remove_eviction_fence(bo, NULL, &ef_list,
- &ef_count);
- if (ret)
- goto validate_fail;
-
- ttm_bo_wait(&bo->tbo, false, false);
- amdgpu_amdkfd_add_eviction_fence(bo, ef_list, ef_count);
- }
+ if (wait)
+ amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
validate_fail:
return ret;
}
-static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+ uint32_t domain,
+ struct dma_fence *fence)
{
- struct amdgpu_vm_parser *p = param;
+ int ret = amdgpu_bo_reserve(bo, false);
+
+ if (ret)
+ return ret;
+
+ ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
+ if (ret)
+ goto unreserve_out;
+
+ ret = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
+ if (ret)
+ goto unreserve_out;
- return amdgpu_amdkfd_bo_validate(bo, p->domain, p->wait);
+ dma_resv_add_fence(bo->tbo.base.resv, fence,
+ DMA_RESV_USAGE_BOOKKEEP);
+
+unreserve_out:
+ amdgpu_bo_unreserve(bo);
+
+ return ret;
+}
+
+static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
+{
+ return amdgpu_amdkfd_bo_validate(bo, bo->allowed_domains, false);
}
/* vm_validate_pt_pd_bos - Validate page table and directory BOs
@@ -377,173 +478,564 @@ static int amdgpu_amdkfd_validate(void *param, struct amdgpu_bo *bo)
* again. Page directories are only updated after updating page
* tables.
*/
-static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
+static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm,
+ struct ww_acquire_ctx *ticket)
{
- struct amdgpu_bo *pd = vm->root.base.bo;
+ struct amdgpu_bo *pd = vm->root.bo;
struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
- struct amdgpu_vm_parser param;
int ret;
- param.domain = AMDGPU_GEM_DOMAIN_VRAM;
- param.wait = false;
-
- ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate,
- &param);
+ ret = amdgpu_vm_validate(adev, vm, ticket,
+ amdgpu_amdkfd_validate_vm_bo, NULL);
if (ret) {
- pr_err("amdgpu: failed to validate PT BOs\n");
+ pr_err("failed to validate PT BOs\n");
return ret;
}
- ret = amdgpu_amdkfd_validate(&param, pd);
- if (ret) {
- pr_err("amdgpu: failed to validate PD\n");
+ vm->pd_phys_addr = amdgpu_gmc_pd_addr(vm->root.bo);
+
+ return 0;
+}
+
+static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
+{
+ struct amdgpu_bo *pd = vm->root.bo;
+ struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
+ int ret;
+
+ ret = amdgpu_vm_update_pdes(adev, vm, false);
+ if (ret)
return ret;
+
+ return amdgpu_sync_fence(sync, vm->last_update, GFP_KERNEL);
+}
+
+static uint64_t get_pte_flags(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+ struct kgd_mem *mem)
+{
+ uint32_t mapping_flags = AMDGPU_VM_PAGE_READABLE |
+ AMDGPU_VM_MTYPE_DEFAULT;
+
+ if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE)
+ mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
+ if (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE)
+ mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
+
+ return mapping_flags;
+}
+
+/**
+ * create_sg_table() - Create an sg_table for a contiguous DMA addr range
+ * @addr: The starting address to point to
+ * @size: Size of memory area in bytes being pointed to
+ *
+ * Allocates an instance of sg_table and initializes it to point to memory
+ * area specified by input parameters. The address used to build is assumed
+ * to be DMA mapped, if needed.
+ *
+ * DOORBELL or MMIO BOs use only one scatterlist node in their sg_table
+ * because they are physically contiguous.
+ *
+ * Return: Initialized instance of SG Table or NULL
+ */
+static struct sg_table *create_sg_table(uint64_t addr, uint32_t size)
+{
+ struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
+
+ if (!sg)
+ return NULL;
+ if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
+ kfree(sg);
+ return NULL;
}
+ sg_dma_address(sg->sgl) = addr;
+ sg->sgl->length = size;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->sgl->dma_length = size;
+#endif
+ return sg;
+}
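+/*
+ * Usage sketch (hypothetical values): create_sg_table(dma_addr, PAGE_SIZE)
+ * yields a single-entry table whose sgl->dma_address/length describe one
+ * contiguous, already DMA-mapped range such as a doorbell page.
+ */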
+
+static int
+kfd_mem_dmamap_userptr(struct kgd_mem *mem,
+ struct kfd_mem_attachment *attachment)
+{
+ enum dma_data_direction direction =
+ mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
+ DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
+ struct ttm_operation_ctx ctx = {.interruptible = true};
+ struct amdgpu_bo *bo = attachment->bo_va->base.bo;
+ struct amdgpu_device *adev = attachment->adev;
+ struct ttm_tt *src_ttm = mem->bo->tbo.ttm;
+ struct ttm_tt *ttm = bo->tbo.ttm;
+ int ret;
- vm->pd_phys_addr = amdgpu_gmc_pd_addr(vm->root.base.bo);
+ if (WARN_ON(ttm->num_pages != src_ttm->num_pages))
+ return -EINVAL;
- if (vm->use_cpu_for_update) {
- ret = amdgpu_bo_kmap(pd, NULL);
- if (ret) {
- pr_err("amdgpu: failed to kmap PD, ret=%d\n", ret);
- return ret;
- }
+ ttm->sg = kmalloc(sizeof(*ttm->sg), GFP_KERNEL);
+ if (unlikely(!ttm->sg))
+ return -ENOMEM;
+
+ /* Same sequence as in amdgpu_ttm_tt_pin_userptr */
+ ret = sg_alloc_table_from_pages(ttm->sg, src_ttm->pages,
+ ttm->num_pages, 0,
+ (u64)ttm->num_pages << PAGE_SHIFT,
+ GFP_KERNEL);
+ if (unlikely(ret))
+ goto free_sg;
+
+ ret = dma_map_sgtable(adev->dev, ttm->sg, direction, 0);
+ if (unlikely(ret))
+ goto release_sg;
+
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+ if (ret)
+ goto unmap_sg;
+
+ return 0;
+
+unmap_sg:
+ dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
+release_sg:
+ pr_err("DMA map userptr failed: %d\n", ret);
+ sg_free_table(ttm->sg);
+free_sg:
+ kfree(ttm->sg);
+ ttm->sg = NULL;
+ return ret;
+}
+
+static int
+kfd_mem_dmamap_dmabuf(struct kfd_mem_attachment *attachment)
+{
+ struct ttm_operation_ctx ctx = {.interruptible = true};
+ struct amdgpu_bo *bo = attachment->bo_va->base.bo;
+
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
+ return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+}
+
+/**
+ * kfd_mem_dmamap_sg_bo() - Create DMA mapped sg_table to access DOORBELL or MMIO BO
+ * @mem: SG BO of the DOORBELL or MMIO resource on the owning device
+ * @attachment: Virtual address attachment of the BO on accessing device
+ *
+ * An access request from the device that owns DOORBELL does not require DMA mapping.
+ * This is because the request doesn't go through the PCIe root complex, i.e.
+ * it loops back instead. The need to DMA map arises only when accessing a
+ * peer device's DOORBELL.
+ *
+ * In contrast, all access requests for MMIO need to be DMA mapped without regard to
+ * device ownership. This is because access requests for MMIO go through PCIe root
+ * complex.
+ *
+ * This is accomplished in two steps:
+ * - Obtain DMA mapped address of DOORBELL or MMIO memory that could be used
+ * in updating requesting device's page table
+ * - Signal TTM to mark memory pointed to by requesting device's BO as GPU
+ * accessible. This allows an update of requesting device's page table
+ * with entries associated with DOORBELL or MMIO memory
+ *
+ * This method is invoked in the following contexts:
+ * - Mapping of DOORBELL or MMIO BO of same or peer device
+ * - Validating an evicted DOORBELL or MMIO BO on a device seeking access
+ *
+ * Return: ZERO if successful, NON-ZERO otherwise
+ */
+static int
+kfd_mem_dmamap_sg_bo(struct kgd_mem *mem,
+ struct kfd_mem_attachment *attachment)
+{
+ struct ttm_operation_ctx ctx = {.interruptible = true};
+ struct amdgpu_bo *bo = attachment->bo_va->base.bo;
+ struct amdgpu_device *adev = attachment->adev;
+ struct ttm_tt *ttm = bo->tbo.ttm;
+ enum dma_data_direction dir;
+ dma_addr_t dma_addr;
+ bool mmio;
+ int ret;
+
+	/* Expect SG Table of dmamap BO to be NULL */
+ mmio = (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP);
+ if (unlikely(ttm->sg)) {
+ pr_err("SG Table of %d BO for peer device is UNEXPECTEDLY NON-NULL", mmio);
+ return -EINVAL;
+ }
+
+ dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
+ DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
+ dma_addr = mem->bo->tbo.sg->sgl->dma_address;
+ pr_debug("%d BO size: %d\n", mmio, mem->bo->tbo.sg->sgl->length);
+ pr_debug("%d BO address before DMA mapping: %llx\n", mmio, dma_addr);
+ dma_addr = dma_map_resource(adev->dev, dma_addr,
+ mem->bo->tbo.sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);
+ ret = dma_mapping_error(adev->dev, dma_addr);
+ if (unlikely(ret))
+ return ret;
+ pr_debug("%d BO address after DMA mapping: %llx\n", mmio, dma_addr);
+
+ ttm->sg = create_sg_table(dma_addr, mem->bo->tbo.sg->sgl->length);
+ if (unlikely(!ttm->sg)) {
+ ret = -ENOMEM;
+ goto unmap_sg;
+ }
+
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+ if (unlikely(ret))
+ goto free_sg;
+
+ return ret;
+
+free_sg:
+ sg_free_table(ttm->sg);
+ kfree(ttm->sg);
+ ttm->sg = NULL;
+unmap_sg:
+ dma_unmap_resource(adev->dev, dma_addr, mem->bo->tbo.sg->sgl->length,
+ dir, DMA_ATTR_SKIP_CPU_SYNC);
+ return ret;
+}
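+/*
+ * Example flow (illustrative): a peer GPU mapping another device's doorbell
+ * first gets a bus address via dma_map_resource() for the single doorbell
+ * page, wraps it in a one-entry sg_table via create_sg_table(), and then
+ * validates the SG BO into the GTT domain so page tables can reference it.
+ */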
+
+static int
+kfd_mem_dmamap_attachment(struct kgd_mem *mem,
+ struct kfd_mem_attachment *attachment)
+{
+ switch (attachment->type) {
+ case KFD_MEM_ATT_SHARED:
+ return 0;
+ case KFD_MEM_ATT_USERPTR:
+ return kfd_mem_dmamap_userptr(mem, attachment);
+ case KFD_MEM_ATT_DMABUF:
+ return kfd_mem_dmamap_dmabuf(attachment);
+ case KFD_MEM_ATT_SG:
+ return kfd_mem_dmamap_sg_bo(mem, attachment);
+ default:
+ WARN_ON_ONCE(1);
+ }
+ return -EINVAL;
+}
+
+static void
+kfd_mem_dmaunmap_userptr(struct kgd_mem *mem,
+ struct kfd_mem_attachment *attachment)
+{
+ enum dma_data_direction direction =
+ mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
+ DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
+ struct ttm_operation_ctx ctx = {.interruptible = false};
+ struct amdgpu_bo *bo = attachment->bo_va->base.bo;
+ struct amdgpu_device *adev = attachment->adev;
+ struct ttm_tt *ttm = bo->tbo.ttm;
+
+ if (unlikely(!ttm->sg))
+ return;
+
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
+ (void)ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+
+ dma_unmap_sgtable(adev->dev, ttm->sg, direction, 0);
+ sg_free_table(ttm->sg);
+ kfree(ttm->sg);
+ ttm->sg = NULL;
+}
+
+static void
+kfd_mem_dmaunmap_dmabuf(struct kfd_mem_attachment *attachment)
+{
+ /* This is a no-op. We don't want to trigger eviction fences when
+ * unmapping DMABufs. Therefore the invalidation (moving to system
+ * domain) is done in kfd_mem_dmamap_dmabuf.
+ */
+}
+
+/**
+ * kfd_mem_dmaunmap_sg_bo() - Free DMA mapped sg_table of DOORBELL or MMIO BO
+ * @mem: SG BO of the DOORBELL or MMIO resource on the owning device
+ * @attachment: Virtual address attachment of the BO on accessing device
+ *
+ * The method performs following steps:
+ * - Signal TTM to mark memory pointed to by BO as GPU inaccessible
+ * - Free SG Table that is used to encapsulate DMA mapped memory of
+ * peer device's DOORBELL or MMIO memory
+ *
+ * This method is invoked in the following contexts:
+ * Unmapping of DOORBELL or MMIO BO on a device having access to its memory
+ * Eviction of DOORBELL or MMIO BO on a device having access to its memory
+ *
+ * Return: void
+ */
+static void
+kfd_mem_dmaunmap_sg_bo(struct kgd_mem *mem,
+ struct kfd_mem_attachment *attachment)
+{
+ struct ttm_operation_ctx ctx = {.interruptible = true};
+ struct amdgpu_bo *bo = attachment->bo_va->base.bo;
+ struct amdgpu_device *adev = attachment->adev;
+ struct ttm_tt *ttm = bo->tbo.ttm;
+ enum dma_data_direction dir;
+
+ if (unlikely(!ttm->sg)) {
+ pr_debug("SG Table of BO is NULL");
+ return;
+ }
+
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
+ (void)ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+
+ dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
+ DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
+ dma_unmap_resource(adev->dev, ttm->sg->sgl->dma_address,
+ ttm->sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);
+ sg_free_table(ttm->sg);
+ kfree(ttm->sg);
+ ttm->sg = NULL;
+ bo->tbo.sg = NULL;
+}
+
+static void
+kfd_mem_dmaunmap_attachment(struct kgd_mem *mem,
+ struct kfd_mem_attachment *attachment)
+{
+ switch (attachment->type) {
+ case KFD_MEM_ATT_SHARED:
+ break;
+ case KFD_MEM_ATT_USERPTR:
+ kfd_mem_dmaunmap_userptr(mem, attachment);
+ break;
+ case KFD_MEM_ATT_DMABUF:
+ kfd_mem_dmaunmap_dmabuf(attachment);
+ break;
+ case KFD_MEM_ATT_SG:
+ kfd_mem_dmaunmap_sg_bo(mem, attachment);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+
+static int kfd_mem_export_dmabuf(struct kgd_mem *mem)
+{
+ if (!mem->dmabuf) {
+ struct amdgpu_device *bo_adev;
+ struct dma_buf *dmabuf;
+
+ bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
+ dmabuf = drm_gem_prime_handle_to_dmabuf(&bo_adev->ddev, bo_adev->kfd.client.file,
+ mem->gem_handle,
+ mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
+ DRM_RDWR : 0);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+ mem->dmabuf = dmabuf;
}
return 0;
}
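+/*
+ * The exported dma_buf is cached in mem->dmabuf, so subsequent
+ * kfd_mem_attach_dmabuf() calls from other GPUs reuse the same export
+ * rather than creating a new one per attachment.
+ */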
-static int vm_update_pds(struct amdgpu_vm *vm, struct amdgpu_sync *sync)
+static int
+kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem,
+ struct amdgpu_bo **bo)
{
- struct amdgpu_bo *pd = vm->root.base.bo;
- struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
+ struct drm_gem_object *gobj;
int ret;
- ret = amdgpu_vm_update_directories(adev, vm);
+ ret = kfd_mem_export_dmabuf(mem);
if (ret)
return ret;
- return amdgpu_sync_fence(NULL, sync, vm->last_update, false);
+ gobj = amdgpu_gem_prime_import(adev_to_drm(adev), mem->dmabuf);
+ if (IS_ERR(gobj))
+ return PTR_ERR(gobj);
+
+ *bo = gem_to_amdgpu_bo(gobj);
+ (*bo)->flags |= AMDGPU_GEM_CREATE_PREEMPTIBLE;
+
+ return 0;
}
-/* add_bo_to_vm - Add a BO to a VM
+/* kfd_mem_attach - Add a BO to a VM
*
 * Everything that needs to be done only once when a BO is first added
* to a VM. It can later be mapped and unmapped many times without
* repeating these steps.
*
+ * 0. Create BO for DMA mapping, if needed
* 1. Allocate and initialize BO VA entry data structure
* 2. Add BO to the VM
* 3. Determine ASIC-specific PTE flags
* 4. Alloc page tables and directories if needed
* 4a. Validate new page tables and directories
*/
-static int add_bo_to_vm(struct amdgpu_device *adev, struct kgd_mem *mem,
- struct amdgpu_vm *vm, bool is_aql,
- struct kfd_bo_va_list **p_bo_va_entry)
+static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
+ struct amdgpu_vm *vm, bool is_aql)
{
- int ret;
- struct kfd_bo_va_list *bo_va_entry;
- struct amdgpu_bo *pd = vm->root.base.bo;
- struct amdgpu_bo *bo = mem->bo;
+ struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev);
+ unsigned long bo_size = mem->bo->tbo.base.size;
uint64_t va = mem->va;
- struct list_head *list_bo_va = &mem->bo_va_list;
- unsigned long bo_size = bo->tbo.mem.size;
+ struct kfd_mem_attachment *attachment[2] = {NULL, NULL};
+ struct amdgpu_bo *bo[2] = {NULL, NULL};
+ struct amdgpu_bo_va *bo_va;
+ bool same_hive = false;
+ int i, ret;
if (!va) {
pr_err("Invalid VA when adding BO to VM\n");
return -EINVAL;
}
- if (is_aql)
- va += bo_size;
-
- bo_va_entry = kzalloc(sizeof(*bo_va_entry), GFP_KERNEL);
- if (!bo_va_entry)
- return -ENOMEM;
-
- pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
- va + bo_size, vm);
-
- /* Add BO to VM internal data structures*/
- bo_va_entry->bo_va = amdgpu_vm_bo_add(adev, vm, bo);
- if (!bo_va_entry->bo_va) {
- ret = -EINVAL;
- pr_err("Failed to add BO object to VM. ret == %d\n",
- ret);
- goto err_vmadd;
+ /* Determine access to VRAM, MMIO and DOORBELL BOs of peer devices
+ *
+	 * The access path of MMIO and DOORBELL BOs is always over PCIe.
+	 * In contrast, the access path of VRAM BOs depends upon the type of
+ * link that connects the peer device. Access over PCIe is allowed
+ * if peer device has large BAR. In contrast, access over xGMI is
+ * allowed for both small and large BAR configurations of peer device
+ */
+ if ((adev != bo_adev && !adev->apu_prefer_gtt) &&
+ ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
+ (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
+ (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
+ if (mem->domain == AMDGPU_GEM_DOMAIN_VRAM)
+ same_hive = amdgpu_xgmi_same_hive(adev, bo_adev);
+ if (!same_hive && !amdgpu_device_is_peer_accessible(bo_adev, adev))
+ return -EINVAL;
}
- bo_va_entry->va = va;
- bo_va_entry->pte_flags = amdgpu_gmc_get_pte_flags(adev,
- mem->mapping_flags);
- bo_va_entry->kgd_dev = (void *)adev;
- list_add(&bo_va_entry->bo_list, list_bo_va);
+ for (i = 0; i <= is_aql; i++) {
+ attachment[i] = kzalloc(sizeof(*attachment[i]), GFP_KERNEL);
+ if (unlikely(!attachment[i])) {
+ ret = -ENOMEM;
+ goto unwind;
+ }
- if (p_bo_va_entry)
- *p_bo_va_entry = bo_va_entry;
+ pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
+ va + bo_size, vm);
- /* Allocate new page tables if needed and validate
- * them. Clearing of new page tables and validate need to wait
- * on move fences. We don't want that to trigger the eviction
- * fence, so remove it temporarily.
- */
- amdgpu_amdkfd_remove_eviction_fence(pd,
- vm->process_info->eviction_fence,
- NULL, NULL);
+ if ((adev == bo_adev && !(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) ||
+ (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) && reuse_dmamap(adev, bo_adev)) ||
+ (mem->domain == AMDGPU_GEM_DOMAIN_GTT && reuse_dmamap(adev, bo_adev)) ||
+ same_hive) {
+			/* Mappings on the local GPU, VRAM mappings within the
+			 * local hive, and userptr or GTT mappings that can reuse
+			 * the DMA-map address space all share the original BO
+ */
+ attachment[i]->type = KFD_MEM_ATT_SHARED;
+ bo[i] = mem->bo;
+ drm_gem_object_get(&bo[i]->tbo.base);
+ } else if (i > 0) {
+ /* Multiple mappings on the same GPU share the BO */
+ attachment[i]->type = KFD_MEM_ATT_SHARED;
+ bo[i] = bo[0];
+ drm_gem_object_get(&bo[i]->tbo.base);
+ } else if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {
+ /* Create an SG BO to DMA-map userptrs on other GPUs */
+ attachment[i]->type = KFD_MEM_ATT_USERPTR;
+ ret = create_dmamap_sg_bo(adev, mem, &bo[i]);
+ if (ret)
+ goto unwind;
+ /* Handle DOORBELL BOs of peer devices and MMIO BOs of local and peer devices */
+ } else if (mem->bo->tbo.type == ttm_bo_type_sg) {
+ WARN_ONCE(!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL ||
+ mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP),
+				  "Handling invalid SG BO in ATTACH request");
+ attachment[i]->type = KFD_MEM_ATT_SG;
+ ret = create_dmamap_sg_bo(adev, mem, &bo[i]);
+ if (ret)
+ goto unwind;
+		/* Enable access to GTT and VRAM BOs of peer devices */
+ } else if (mem->domain == AMDGPU_GEM_DOMAIN_GTT ||
+ mem->domain == AMDGPU_GEM_DOMAIN_VRAM) {
+ attachment[i]->type = KFD_MEM_ATT_DMABUF;
+ ret = kfd_mem_attach_dmabuf(adev, mem, &bo[i]);
+ if (ret)
+ goto unwind;
+ pr_debug("Employ DMABUF mechanism to enable peer GPU access\n");
+ } else {
+ WARN_ONCE(true, "Handling invalid ATTACH request");
+ ret = -EINVAL;
+ goto unwind;
+ }
- ret = amdgpu_vm_alloc_pts(adev, vm, va, amdgpu_bo_size(bo));
- if (ret) {
- pr_err("Failed to allocate pts, err=%d\n", ret);
- goto err_alloc_pts;
- }
+ /* Add BO to VM internal data structures */
+ ret = amdgpu_bo_reserve(bo[i], false);
+ if (ret) {
+ pr_debug("Unable to reserve BO during memory attach");
+ goto unwind;
+ }
+ bo_va = amdgpu_vm_bo_find(vm, bo[i]);
+ if (!bo_va)
+ bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]);
+ else
+ ++bo_va->ref_count;
+ attachment[i]->bo_va = bo_va;
+ amdgpu_bo_unreserve(bo[i]);
+ if (unlikely(!attachment[i]->bo_va)) {
+ ret = -ENOMEM;
+ pr_err("Failed to add BO object to VM. ret == %d\n",
+ ret);
+ goto unwind;
+ }
+ attachment[i]->va = va;
+ attachment[i]->pte_flags = get_pte_flags(adev, vm, mem);
+ attachment[i]->adev = adev;
+ list_add(&attachment[i]->list, &mem->attachments);
- ret = vm_validate_pt_pd_bos(vm);
- if (ret) {
- pr_err("validate_pt_pd_bos() failed\n");
- goto err_alloc_pts;
+ va += bo_size;
}
- /* Add the eviction fence back */
- amdgpu_bo_fence(pd, &vm->process_info->eviction_fence->base, true);
-
return 0;
-err_alloc_pts:
- amdgpu_bo_fence(pd, &vm->process_info->eviction_fence->base, true);
- amdgpu_vm_bo_rmv(adev, bo_va_entry->bo_va);
- list_del(&bo_va_entry->bo_list);
-err_vmadd:
- kfree(bo_va_entry);
+unwind:
+ for (; i >= 0; i--) {
+ if (!attachment[i])
+ continue;
+ if (attachment[i]->bo_va) {
+ (void)amdgpu_bo_reserve(bo[i], true);
+ if (--attachment[i]->bo_va->ref_count == 0)
+ amdgpu_vm_bo_del(adev, attachment[i]->bo_va);
+ amdgpu_bo_unreserve(bo[i]);
+ list_del(&attachment[i]->list);
+ }
+ if (bo[i])
+ drm_gem_object_put(&bo[i]->tbo.base);
+ kfree(attachment[i]);
+ }
return ret;
}
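+/*
+ * In short, the attachment types chosen above are: local, same-hive and
+ * DMA-map-reusable mappings use KFD_MEM_ATT_SHARED, remote userptrs use
+ * KFD_MEM_ATT_USERPTR, doorbell/MMIO SG BOs use KFD_MEM_ATT_SG, and
+ * remote GTT/VRAM BOs use KFD_MEM_ATT_DMABUF.
+ */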
-static void remove_bo_from_vm(struct amdgpu_device *adev,
- struct kfd_bo_va_list *entry, unsigned long size)
+static void kfd_mem_detach(struct kfd_mem_attachment *attachment)
{
- pr_debug("\t remove VA 0x%llx - 0x%llx in entry %p\n",
- entry->va,
- entry->va + size, entry);
- amdgpu_vm_bo_rmv(adev, entry->bo_va);
- list_del(&entry->bo_list);
- kfree(entry);
+ struct amdgpu_bo *bo = attachment->bo_va->base.bo;
+
+ pr_debug("\t remove VA 0x%llx in entry %p\n",
+ attachment->va, attachment);
+ if (--attachment->bo_va->ref_count == 0)
+ amdgpu_vm_bo_del(attachment->adev, attachment->bo_va);
+ drm_gem_object_put(&bo->tbo.base);
+ list_del(&attachment->list);
+ kfree(attachment);
}
static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
struct amdkfd_process_info *process_info,
bool userptr)
{
- struct ttm_validate_buffer *entry = &mem->validate_list;
- struct amdgpu_bo *bo = mem->bo;
-
- INIT_LIST_HEAD(&entry->head);
- entry->num_shared = 1;
- entry->bo = &bo->tbo;
mutex_lock(&process_info->lock);
if (userptr)
- list_add_tail(&entry->head, &process_info->userptr_valid_list);
+ list_add_tail(&mem->validate_list,
+ &process_info->userptr_valid_list);
else
- list_add_tail(&entry->head, &process_info->kfd_bo_list);
+ list_add_tail(&mem->validate_list, &process_info->kfd_bo_list);
+ mutex_unlock(&process_info->lock);
+}
+
+static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
+ struct amdkfd_process_info *process_info)
+{
+ mutex_lock(&process_info->lock);
+ list_del(&mem->validate_list);
mutex_unlock(&process_info->lock);
}
@@ -559,56 +1051,68 @@ static void add_kgd_mem_to_kfd_bo_list(struct kgd_mem *mem,
*
* Returns 0 for success, negative errno for errors.
*/
-static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
- uint64_t user_addr)
+static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
+ bool criu_resume)
{
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
struct ttm_operation_ctx ctx = { true, false };
+ struct amdgpu_hmm_range *range;
int ret = 0;
mutex_lock(&process_info->lock);
- ret = amdgpu_ttm_tt_set_userptr(bo->tbo.ttm, user_addr, 0);
+ ret = amdgpu_ttm_tt_set_userptr(&bo->tbo, user_addr, 0);
if (ret) {
pr_err("%s: Failed to set userptr: %d\n", __func__, ret);
goto out;
}
- ret = amdgpu_mn_register(bo, user_addr);
+ ret = amdgpu_hmm_register(bo, user_addr);
if (ret) {
pr_err("%s: Failed to register MMU notifier: %d\n",
__func__, ret);
goto out;
}
- /* If no restore worker is running concurrently, user_pages
- * should not be allocated
- */
- WARN(mem->user_pages, "Leaking user_pages array");
+ if (criu_resume) {
+ /*
+ * During a CRIU restore operation, the userptr buffer objects
+ * will be validated in the restore_userptr_work worker at a
+ * later stage when it is scheduled by another ioctl called by
+ * CRIU master process for the target pid for restore.
+ */
+ mutex_lock(&process_info->notifier_lock);
+ mem->invalid++;
+ mutex_unlock(&process_info->notifier_lock);
+ mutex_unlock(&process_info->lock);
+ return 0;
+ }
- mem->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages,
- sizeof(struct page *),
- GFP_KERNEL | __GFP_ZERO);
- if (!mem->user_pages) {
- pr_err("%s: Failed to allocate pages array\n", __func__);
+ range = amdgpu_hmm_range_alloc(NULL);
+ if (unlikely(!range)) {
ret = -ENOMEM;
goto unregister_out;
}
- ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm, mem->user_pages);
+ ret = amdgpu_ttm_tt_get_user_pages(bo, range);
if (ret) {
- pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
- goto free_out;
+ amdgpu_hmm_range_free(range);
+ if (ret == -EAGAIN)
+ pr_debug("Failed to get user pages, try again\n");
+ else
+ pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
+ goto unregister_out;
}
- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->user_pages);
-
ret = amdgpu_bo_reserve(bo, true);
if (ret) {
pr_err("%s: Failed to reserve BO\n", __func__);
goto release_out;
}
+
+ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, range);
+
amdgpu_bo_placement_from_domain(bo, mem->domain);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (ret)
@@ -616,14 +1120,10 @@ static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm,
amdgpu_bo_unreserve(bo);
release_out:
- if (ret)
- release_pages(mem->user_pages, bo->tbo.ttm->num_pages);
-free_out:
- kvfree(mem->user_pages);
- mem->user_pages = NULL;
+ amdgpu_hmm_range_free(range);
unregister_out:
if (ret)
- amdgpu_mn_unregister(bo);
+ amdgpu_hmm_unregister(bo);
out:
mutex_unlock(&process_info->lock);
return ret;
@@ -635,13 +1135,12 @@ out:
* object can track VM updates.
*/
struct bo_vm_reservation_context {
- struct amdgpu_bo_list_entry kfd_bo; /* BO list entry for the KFD BO */
- unsigned int n_vms; /* Number of VMs reserved */
- struct amdgpu_bo_list_entry *vm_pd; /* Array of VM BO list entries */
- struct ww_acquire_ctx ticket; /* Reservation ticket */
- struct list_head list, duplicates; /* BO lists */
- struct amdgpu_sync *sync; /* Pointer to sync object */
- bool reserved; /* Whether BOs are reserved */
+ /* DRM execution context for the reservation */
+ struct drm_exec exec;
+ /* Number of VMs reserved */
+ unsigned int n_vms;
+ /* Pointer to sync object */
+ struct amdgpu_sync *sync;
};
enum bo_vm_match {
@@ -665,35 +1164,25 @@ static int reserve_bo_and_vm(struct kgd_mem *mem,
WARN_ON(!vm);
- ctx->reserved = false;
ctx->n_vms = 1;
ctx->sync = &mem->sync;
+ drm_exec_init(&ctx->exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0);
+ drm_exec_until_all_locked(&ctx->exec) {
+ ret = amdgpu_vm_lock_pd(vm, &ctx->exec, 2);
+ drm_exec_retry_on_contention(&ctx->exec);
+ if (unlikely(ret))
+ goto error;
- INIT_LIST_HEAD(&ctx->list);
- INIT_LIST_HEAD(&ctx->duplicates);
-
- ctx->vm_pd = kcalloc(ctx->n_vms, sizeof(*ctx->vm_pd), GFP_KERNEL);
- if (!ctx->vm_pd)
- return -ENOMEM;
-
- ctx->kfd_bo.priority = 0;
- ctx->kfd_bo.tv.bo = &bo->tbo;
- ctx->kfd_bo.tv.num_shared = 1;
- ctx->kfd_bo.user_pages = NULL;
- list_add(&ctx->kfd_bo.tv.head, &ctx->list);
-
- amdgpu_vm_get_pd_bo(vm, &ctx->list, &ctx->vm_pd[0]);
-
- ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
- false, &ctx->duplicates);
- if (!ret)
- ctx->reserved = true;
- else {
- pr_err("Failed to reserve buffers in ttm\n");
- kfree(ctx->vm_pd);
- ctx->vm_pd = NULL;
+ ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);
+ drm_exec_retry_on_contention(&ctx->exec);
+ if (unlikely(ret))
+ goto error;
}
+ return 0;
+error:
+ pr_err("Failed to reserve buffers in ttm.\n");
+ drm_exec_fini(&ctx->exec);
return ret;
}
@@ -711,65 +1200,39 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,
struct amdgpu_vm *vm, enum bo_vm_match map_type,
struct bo_vm_reservation_context *ctx)
{
+ struct kfd_mem_attachment *entry;
struct amdgpu_bo *bo = mem->bo;
- struct kfd_bo_va_list *entry;
- unsigned int i;
int ret;
- ctx->reserved = false;
- ctx->n_vms = 0;
- ctx->vm_pd = NULL;
ctx->sync = &mem->sync;
+ drm_exec_init(&ctx->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
+ DRM_EXEC_IGNORE_DUPLICATES, 0);
+ drm_exec_until_all_locked(&ctx->exec) {
+ ctx->n_vms = 0;
+ list_for_each_entry(entry, &mem->attachments, list) {
+ if ((vm && vm != entry->bo_va->base.vm) ||
+ (entry->is_mapped != map_type
+ && map_type != BO_VM_ALL))
+ continue;
- INIT_LIST_HEAD(&ctx->list);
- INIT_LIST_HEAD(&ctx->duplicates);
-
- list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
- if ((vm && vm != entry->bo_va->base.vm) ||
- (entry->is_mapped != map_type
- && map_type != BO_VM_ALL))
- continue;
-
- ctx->n_vms++;
- }
-
- if (ctx->n_vms != 0) {
- ctx->vm_pd = kcalloc(ctx->n_vms, sizeof(*ctx->vm_pd),
- GFP_KERNEL);
- if (!ctx->vm_pd)
- return -ENOMEM;
- }
-
- ctx->kfd_bo.priority = 0;
- ctx->kfd_bo.tv.bo = &bo->tbo;
- ctx->kfd_bo.tv.num_shared = 1;
- ctx->kfd_bo.user_pages = NULL;
- list_add(&ctx->kfd_bo.tv.head, &ctx->list);
-
- i = 0;
- list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
- if ((vm && vm != entry->bo_va->base.vm) ||
- (entry->is_mapped != map_type
- && map_type != BO_VM_ALL))
- continue;
-
- amdgpu_vm_get_pd_bo(entry->bo_va->base.vm, &ctx->list,
- &ctx->vm_pd[i]);
- i++;
- }
-
- ret = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->list,
- false, &ctx->duplicates);
- if (!ret)
- ctx->reserved = true;
- else
- pr_err("Failed to reserve buffers in ttm.\n");
+ ret = amdgpu_vm_lock_pd(entry->bo_va->base.vm,
+ &ctx->exec, 2);
+ drm_exec_retry_on_contention(&ctx->exec);
+ if (unlikely(ret))
+ goto error;
+ ++ctx->n_vms;
+ }
- if (ret) {
- kfree(ctx->vm_pd);
- ctx->vm_pd = NULL;
+ ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);
+ drm_exec_retry_on_contention(&ctx->exec);
+ if (unlikely(ret))
+ goto error;
}
+ return 0;
+error:
+ pr_err("Failed to reserve buffers in ttm.\n");
+ drm_exec_fini(&ctx->exec);
return ret;
}
@@ -791,59 +1254,48 @@ static int unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx,
if (wait)
ret = amdgpu_sync_wait(ctx->sync, intr);
- if (ctx->reserved)
- ttm_eu_backoff_reservation(&ctx->ticket, &ctx->list);
- kfree(ctx->vm_pd);
-
+ drm_exec_fini(&ctx->exec);
ctx->sync = NULL;
-
- ctx->reserved = false;
- ctx->vm_pd = NULL;
-
return ret;
}
-static int unmap_bo_from_gpuvm(struct amdgpu_device *adev,
- struct kfd_bo_va_list *entry,
+static int unmap_bo_from_gpuvm(struct kgd_mem *mem,
+ struct kfd_mem_attachment *entry,
struct amdgpu_sync *sync)
{
struct amdgpu_bo_va *bo_va = entry->bo_va;
+ struct amdgpu_device *adev = entry->adev;
struct amdgpu_vm *vm = bo_va->base.vm;
- struct amdgpu_bo *pd = vm->root.base.bo;
- /* Remove eviction fence from PD (and thereby from PTs too as
- * they share the resv. object). Otherwise during PT update
- * job (see amdgpu_vm_bo_update_mapping), eviction fence would
- * get added to job->sync object and job execution would
- * trigger the eviction fence.
- */
- amdgpu_amdkfd_remove_eviction_fence(pd,
- vm->process_info->eviction_fence,
- NULL, NULL);
- amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
+ if (bo_va->queue_refcount) {
+ pr_debug("bo_va->queue_refcount %d\n", bo_va->queue_refcount);
+ return -EBUSY;
+ }
- amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
+ (void)amdgpu_vm_bo_unmap(adev, bo_va, entry->va);
- /* Add the eviction fence back */
- amdgpu_bo_fence(pd, &vm->process_info->eviction_fence->base, true);
+ /* VM entity stopped if process killed, don't clear freed pt bo */
+ if (!amdgpu_vm_ready(vm))
+ return 0;
- amdgpu_sync_fence(NULL, sync, bo_va->last_pt_update, false);
+ (void)amdgpu_vm_clear_freed(adev, vm, &bo_va->last_pt_update);
+
+ (void)amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
return 0;
}
-static int update_gpuvm_pte(struct amdgpu_device *adev,
- struct kfd_bo_va_list *entry,
- struct amdgpu_sync *sync)
+static int update_gpuvm_pte(struct kgd_mem *mem,
+ struct kfd_mem_attachment *entry,
+ struct amdgpu_sync *sync)
{
+ struct amdgpu_bo_va *bo_va = entry->bo_va;
+ struct amdgpu_device *adev = entry->adev;
int ret;
- struct amdgpu_vm *vm;
- struct amdgpu_bo_va *bo_va;
- struct amdgpu_bo *bo;
- bo_va = entry->bo_va;
- vm = bo_va->base.vm;
- bo = bo_va->base.bo;
+ ret = kfd_mem_dmamap_attachment(mem, entry);
+ if (ret)
+ return ret;
/* Update the page tables */
ret = amdgpu_vm_bo_update(adev, bo_va, false);
@@ -852,17 +1304,18 @@ static int update_gpuvm_pte(struct amdgpu_device *adev,
return ret;
}
- return amdgpu_sync_fence(NULL, sync, bo_va->last_pt_update, false);
+ return amdgpu_sync_fence(sync, bo_va->last_pt_update, GFP_KERNEL);
}
-static int map_bo_to_gpuvm(struct amdgpu_device *adev,
- struct kfd_bo_va_list *entry, struct amdgpu_sync *sync,
- bool no_update_pte)
+static int map_bo_to_gpuvm(struct kgd_mem *mem,
+ struct kfd_mem_attachment *entry,
+ struct amdgpu_sync *sync,
+ bool no_update_pte)
{
int ret;
/* Set virtual address for the allocation */
- ret = amdgpu_vm_bo_map(adev, entry->bo_va, entry->va, 0,
+ ret = amdgpu_vm_bo_map(entry->adev, entry->bo_va, entry->va, 0,
amdgpu_bo_size(entry->bo_va->base.bo),
entry->pte_flags);
if (ret) {
@@ -874,7 +1327,7 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
if (no_update_pte)
return 0;
- ret = update_gpuvm_pte(adev, entry, sync);
+ ret = update_gpuvm_pte(mem, entry, sync);
if (ret) {
pr_err("update_gpuvm_pte() failed\n");
goto update_gpuvm_pte_failed;
@@ -883,36 +1336,20 @@ static int map_bo_to_gpuvm(struct amdgpu_device *adev,
return 0;
update_gpuvm_pte_failed:
- unmap_bo_from_gpuvm(adev, entry, sync);
+ unmap_bo_from_gpuvm(mem, entry, sync);
+ kfd_mem_dmaunmap_attachment(mem, entry);
return ret;
}
-static struct sg_table *create_doorbell_sg(uint64_t addr, uint32_t size)
-{
- struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
-
- if (!sg)
- return NULL;
- if (sg_alloc_table(sg, 1, GFP_KERNEL)) {
- kfree(sg);
- return NULL;
- }
- sg->sgl->dma_address = addr;
- sg->sgl->length = size;
-#ifdef CONFIG_NEED_SG_DMA_LENGTH
- sg->sgl->dma_length = size;
-#endif
- return sg;
-}
-
-static int process_validate_vms(struct amdkfd_process_info *process_info)
+static int process_validate_vms(struct amdkfd_process_info *process_info,
+ struct ww_acquire_ctx *ticket)
{
struct amdgpu_vm *peer_vm;
int ret;
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
- ret = vm_validate_pt_pd_bos(peer_vm);
+ ret = vm_validate_pt_pd_bos(peer_vm, ticket);
if (ret)
return ret;
}
@@ -928,11 +1365,11 @@ static int process_sync_pds_resv(struct amdkfd_process_info *process_info,
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
- struct amdgpu_bo *pd = peer_vm->root.base.bo;
+ struct amdgpu_bo *pd = peer_vm->root.bo;
- ret = amdgpu_sync_resv(NULL,
- sync, pd->tbo.resv,
- AMDGPU_FENCE_OWNER_UNDEFINED, false);
+ ret = amdgpu_sync_resv(NULL, sync, pd->tbo.base.resv,
+ AMDGPU_SYNC_NE_OWNER,
+ AMDGPU_FENCE_OWNER_KFD);
if (ret)
return ret;
}
@@ -968,6 +1405,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
return -ENOMEM;
mutex_init(&info->lock);
+ mutex_init(&info->notifier_lock);
INIT_LIST_HEAD(&info->vm_list_head);
INIT_LIST_HEAD(&info->kfd_bo_list);
INIT_LIST_HEAD(&info->userptr_valid_list);
@@ -975,7 +1413,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
info->eviction_fence =
amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
- current->mm);
+ current->mm,
+ NULL);
if (!info->eviction_fence) {
pr_err("Failed to create eviction fence\n");
ret = -ENOMEM;
@@ -983,104 +1422,135 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
}
info->pid = get_task_pid(current->group_leader, PIDTYPE_PID);
- atomic_set(&info->evicted_bos, 0);
INIT_DELAYED_WORK(&info->restore_userptr_work,
amdgpu_amdkfd_restore_userptr_worker);
*process_info = info;
- *ef = dma_fence_get(&info->eviction_fence->base);
}
vm->process_info = *process_info;
/* Validate page directory and attach eviction fence */
- ret = amdgpu_bo_reserve(vm->root.base.bo, true);
+ ret = amdgpu_bo_reserve(vm->root.bo, true);
if (ret)
goto reserve_pd_fail;
- ret = vm_validate_pt_pd_bos(vm);
+ ret = vm_validate_pt_pd_bos(vm, NULL);
if (ret) {
pr_err("validate_pt_pd_bos() failed\n");
goto validate_pd_fail;
}
- ret = ttm_bo_wait(&vm->root.base.bo->tbo, false, false);
+ ret = amdgpu_bo_sync_wait(vm->root.bo,
+ AMDGPU_FENCE_OWNER_KFD, false);
if (ret)
goto wait_pd_fail;
- amdgpu_bo_fence(vm->root.base.bo,
- &vm->process_info->eviction_fence->base, true);
- amdgpu_bo_unreserve(vm->root.base.bo);
+ ret = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1);
+ if (ret)
+ goto reserve_shared_fail;
+ dma_resv_add_fence(vm->root.bo->tbo.base.resv,
+ &vm->process_info->eviction_fence->base,
+ DMA_RESV_USAGE_BOOKKEEP);
+ amdgpu_bo_unreserve(vm->root.bo);
/* Update process info */
mutex_lock(&vm->process_info->lock);
list_add_tail(&vm->vm_list_node,
&(vm->process_info->vm_list_head));
vm->process_info->n_vms++;
+ if (ef)
+ *ef = dma_fence_get(&vm->process_info->eviction_fence->base);
mutex_unlock(&vm->process_info->lock);
return 0;
+reserve_shared_fail:
wait_pd_fail:
validate_pd_fail:
- amdgpu_bo_unreserve(vm->root.base.bo);
+ amdgpu_bo_unreserve(vm->root.bo);
reserve_pd_fail:
vm->process_info = NULL;
if (info) {
- /* Two fence references: one in info and one in *ef */
dma_fence_put(&info->eviction_fence->base);
- dma_fence_put(*ef);
- *ef = NULL;
*process_info = NULL;
put_pid(info->pid);
create_evict_fence_fail:
mutex_destroy(&info->lock);
+ mutex_destroy(&info->notifier_lock);
kfree(info);
}
return ret;
}
-int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int pasid,
- void **vm, void **process_info,
- struct dma_fence **ef)
+/**
+ * amdgpu_amdkfd_gpuvm_pin_bo() - Pins a BO using the following criteria
+ * @bo: Handle of buffer object being pinned
+ * @domain: Domain into which BO should be pinned
+ *
+ * - USERPTR BOs are UNPINNABLE and will return error
+ * - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
+ * PIN count incremented. It is valid to PIN a BO multiple times
+ *
+ * Return: ZERO if successful in pinning, Non-Zero in case of error.
+ */
+static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct amdgpu_vm *new_vm;
- int ret;
+ int ret = 0;
- new_vm = kzalloc(sizeof(*new_vm), GFP_KERNEL);
- if (!new_vm)
- return -ENOMEM;
+ ret = amdgpu_bo_reserve(bo, false);
+ if (unlikely(ret))
+ return ret;
- /* Initialize AMDGPU part of the VM */
- ret = amdgpu_vm_init(adev, new_vm, AMDGPU_VM_CONTEXT_COMPUTE, pasid);
- if (ret) {
- pr_err("Failed init vm ret %d\n", ret);
- goto amdgpu_vm_init_fail;
+ if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
+ /*
+ * If bo is not contiguous on VRAM, move to system memory first to ensure
+ * we can get contiguous VRAM space after evicting other BOs.
+ */
+ if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
+ struct ttm_operation_ctx ctx = { true, false };
+
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+ if (unlikely(ret)) {
+ pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret);
+ goto out;
+ }
+ }
}
- /* Initialize KFD part of the VM and process info */
- ret = init_kfd_vm(new_vm, process_info, ef);
+ ret = amdgpu_bo_pin(bo, domain);
if (ret)
- goto init_kfd_vm_fail;
+ pr_err("Error in Pinning BO to domain: %d\n", domain);
+
+ amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
+out:
+ amdgpu_bo_unreserve(bo);
+ return ret;
+}
- *vm = (void *) new_vm;
+/**
+ * amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins a BO using the following criteria
+ * @bo: Handle of buffer object being unpinned
+ *
+ * - Is an illegal request for USERPTR BOs and is ignored
+ * - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
+ * PIN count decremented. Calls to UNPIN must balance calls to PIN
+ */
+static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
+{
+ int ret = 0;
- return 0;
+ ret = amdgpu_bo_reserve(bo, false);
+ if (unlikely(ret))
+ return;
-init_kfd_vm_fail:
- amdgpu_vm_fini(adev, new_vm);
-amdgpu_vm_init_fail:
- kfree(new_vm);
- return ret;
+ amdgpu_bo_unpin(bo);
+ amdgpu_bo_unreserve(bo);
}
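A minimal caller sketch for the pin/unpin pair above (the wrapper below is hypothetical and not part of this patch); it only illustrates that every pin must be balanced by an unpin:

static int example_pin_doorbell(struct amdgpu_bo *bo)
{
        int r;

        /* Pin into GTT; per the kernel-doc above, USERPTR BOs would fail here */
        r = amdgpu_amdkfd_gpuvm_pin_bo(bo, AMDGPU_GEM_DOMAIN_GTT);
        if (r)
                return r;

        /* ... the BO stays resident while pinned ... */

        amdgpu_amdkfd_gpuvm_unpin_bo(bo);       /* balances the pin above */
        return 0;
}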
-int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd,
- struct file *filp, unsigned int pasid,
- void **vm, void **process_info,
+int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
+ struct amdgpu_vm *avm,
+ void **process_info,
struct dma_fence **ef)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct drm_file *drm_priv = filp->private_data;
- struct amdgpu_fpriv *drv_priv = drm_priv->driver_priv;
- struct amdgpu_vm *avm = &drv_priv->vm;
int ret;
/* Already a compute VM? */
@@ -1088,7 +1558,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd,
return -EINVAL;
/* Convert VM into a compute VM */
- ret = amdgpu_vm_make_compute(adev, avm, pasid);
+ ret = amdgpu_vm_make_compute(adev, avm);
if (ret)
return ret;
@@ -1097,7 +1567,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd,
if (ret)
return ret;
- *vm = (void *)avm;
+ amdgpu_vm_set_task_info(avm);
return 0;
}
@@ -1106,22 +1576,18 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
struct amdgpu_vm *vm)
{
struct amdkfd_process_info *process_info = vm->process_info;
- struct amdgpu_bo *pd = vm->root.base.bo;
if (!process_info)
return;
- /* Release eviction fence from PD */
- amdgpu_bo_reserve(pd, false);
- amdgpu_bo_fence(pd, NULL, false);
- amdgpu_bo_unreserve(pd);
-
/* Update process info */
mutex_lock(&process_info->lock);
process_info->n_vms--;
list_del(&vm->vm_list_node);
mutex_unlock(&process_info->lock);
+ vm->process_info = NULL;
+
/* Release per-process resources when last compute VM is destroyed */
if (!process_info->n_vms) {
WARN_ON(!list_empty(&process_info->kfd_bo_list));
@@ -1132,165 +1598,218 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
cancel_delayed_work_sync(&process_info->restore_userptr_work);
put_pid(process_info->pid);
mutex_destroy(&process_info->lock);
+ mutex_destroy(&process_info->notifier_lock);
kfree(process_info);
}
}
-void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm)
+uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
-
- if (WARN_ON(!kgd || !vm))
- return;
-
- pr_debug("Destroying process vm %p\n", vm);
+ struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
+ struct amdgpu_bo *pd = avm->root.bo;
+ struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
- /* Release the VM context */
- amdgpu_vm_fini(adev, avm);
- kfree(vm);
+ if (adev->asic_type < CHIP_VEGA10)
+ return avm->pd_phys_addr >> AMDGPU_GPU_PAGE_SHIFT;
+ return avm->pd_phys_addr;
}
-void amdgpu_amdkfd_gpuvm_release_process_vm(struct kgd_dev *kgd, void *vm)
+void amdgpu_amdkfd_block_mmu_notifications(void *p)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
+ struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
- if (WARN_ON(!kgd || !vm))
- return;
+ mutex_lock(&pinfo->lock);
+ WRITE_ONCE(pinfo->block_mmu_notifications, true);
+ mutex_unlock(&pinfo->lock);
+}
- pr_debug("Releasing process vm %p\n", vm);
+int amdgpu_amdkfd_criu_resume(void *p)
+{
+ int ret = 0;
+ struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
+
+ mutex_lock(&pinfo->lock);
+ pr_debug("scheduling work\n");
+ mutex_lock(&pinfo->notifier_lock);
+ pinfo->evicted_bos++;
+ mutex_unlock(&pinfo->notifier_lock);
+ if (!READ_ONCE(pinfo->block_mmu_notifications)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ WRITE_ONCE(pinfo->block_mmu_notifications, false);
+ queue_delayed_work(system_freezable_wq,
+ &pinfo->restore_userptr_work, 0);
- /* The original pasid of amdgpu vm has already been
- * released during making a amdgpu vm to a compute vm
- * The current pasid is managed by kfd and will be
- * released on kfd process destroy. Set amdgpu pasid
- * to 0 to avoid duplicate release.
- */
- amdgpu_vm_release_compute(adev, avm);
+out_unlock:
+ mutex_unlock(&pinfo->lock);
+ return ret;
}
-uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm)
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
+ uint8_t xcp_id)
{
- struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
- struct amdgpu_bo *pd = avm->root.base.bo;
- struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
+ uint64_t reserved_for_pt =
+ ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
+ ssize_t available;
+ uint64_t vram_available, system_mem_available, ttm_mem_available;
- if (adev->asic_type < CHIP_VEGA10)
- return avm->pd_phys_addr >> AMDGPU_GPU_PAGE_SHIFT;
- return avm->pd_phys_addr;
+ spin_lock(&kfd_mem_limit.mem_limit_lock);
+ if (adev->apu_prefer_gtt && !adev->gmc.is_app_apu)
+ vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
+ - adev->kfd.vram_used_aligned[xcp_id];
+ else
+ vram_available = KFD_XCP_MEMORY_SIZE(adev, xcp_id)
+ - adev->kfd.vram_used_aligned[xcp_id]
+ - atomic64_read(&adev->vram_pin_size)
+ - reserved_for_pt
+ - reserved_for_ras;
+
+ if (adev->apu_prefer_gtt) {
+ system_mem_available = no_system_mem_limit ?
+ kfd_mem_limit.max_system_mem_limit :
+ kfd_mem_limit.max_system_mem_limit -
+ kfd_mem_limit.system_mem_used;
+
+ ttm_mem_available = kfd_mem_limit.max_ttm_mem_limit -
+ kfd_mem_limit.ttm_mem_used;
+
+ available = min3(system_mem_available, ttm_mem_available,
+ vram_available);
+ available = ALIGN_DOWN(available, PAGE_SIZE);
+ } else {
+ available = ALIGN_DOWN(vram_available, VRAM_AVAILABLITY_ALIGN);
+ }
+
+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
+
+ if (available < 0)
+ available = 0;
+
+ return available;
}
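A quick userspace-style illustration of the 2 MiB rounding applied above (all values invented; EXAMPLE_ALIGN_DOWN is a stand-in that mirrors the kernel's ALIGN_DOWN for power-of-two alignments):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's ALIGN_DOWN() with a power-of-two alignment */
#define EXAMPLE_ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t vram_available = 0x12345678;   /* made-up free-VRAM figure */

        /* Rounds down to a 2 MiB (1 << 21) boundary: prints 0x12200000 */
        printf("%#llx\n", (unsigned long long)
               EXAMPLE_ALIGN_DOWN(vram_available, 1ULL << 21));
        return 0;
}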
int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
- struct kgd_dev *kgd, uint64_t va, uint64_t size,
- void *vm, struct kgd_mem **mem,
- uint64_t *offset, uint32_t flags)
+ struct amdgpu_device *adev, uint64_t va, uint64_t size,
+ void *drm_priv, struct kgd_mem **mem,
+ uint64_t *offset, uint32_t flags, bool criu_resume)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
+ struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
+ struct amdgpu_fpriv *fpriv = container_of(avm, struct amdgpu_fpriv, vm);
enum ttm_bo_type bo_type = ttm_bo_type_device;
struct sg_table *sg = NULL;
uint64_t user_addr = 0;
struct amdgpu_bo *bo;
- struct amdgpu_bo_param bp;
- int byte_align;
+ struct drm_gem_object *gobj = NULL;
u32 domain, alloc_domain;
+ uint64_t aligned_size;
+ int8_t xcp_id = -1;
u64 alloc_flags;
- uint32_t mapping_flags;
int ret;
/*
* Check on which domain to allocate BO
*/
- if (flags & ALLOC_MEM_FLAGS_VRAM) {
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
- alloc_flags = AMDGPU_GEM_CREATE_VRAM_CLEARED;
- alloc_flags |= (flags & ALLOC_MEM_FLAGS_PUBLIC) ?
- AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
- AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
- } else if (flags & ALLOC_MEM_FLAGS_GTT) {
+
+ if (adev->apu_prefer_gtt) {
+ domain = AMDGPU_GEM_DOMAIN_GTT;
+ alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
+ alloc_flags = 0;
+ } else {
+ alloc_flags = AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;
+ alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) ?
+ AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
+
+ /* For contiguous VRAM allocation */
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_CONTIGUOUS)
+ alloc_flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+ }
+ xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
+ 0 : fpriv->xcp_id;
+ } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_flags = 0;
- } else if (flags & ALLOC_MEM_FLAGS_USERPTR) {
- domain = AMDGPU_GEM_DOMAIN_GTT;
- alloc_domain = AMDGPU_GEM_DOMAIN_CPU;
- alloc_flags = 0;
- if (!offset || !*offset)
- return -EINVAL;
- user_addr = *offset;
- } else if (flags & ALLOC_MEM_FLAGS_DOORBELL) {
+ } else {
domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_domain = AMDGPU_GEM_DOMAIN_CPU;
- bo_type = ttm_bo_type_sg;
- alloc_flags = 0;
- if (size > UINT_MAX)
+ alloc_flags = AMDGPU_GEM_CREATE_PREEMPTIBLE;
+
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+ if (!offset || !*offset)
+ return -EINVAL;
+ user_addr = untagged_addr(*offset);
+ } else if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
+ bo_type = ttm_bo_type_sg;
+ if (size > UINT_MAX)
+ return -EINVAL;
+ sg = create_sg_table(*offset, size);
+ if (!sg)
+ return -ENOMEM;
+ } else {
return -EINVAL;
- sg = create_doorbell_sg(*offset, size);
- if (!sg)
- return -ENOMEM;
- } else {
- return -EINVAL;
+ }
}
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
+ alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)
+ alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENT;
+ if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)
+ alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;
+
*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
if (!*mem) {
ret = -ENOMEM;
goto err;
}
- INIT_LIST_HEAD(&(*mem)->bo_va_list);
+ INIT_LIST_HEAD(&(*mem)->attachments);
mutex_init(&(*mem)->lock);
- (*mem)->aql_queue = !!(flags & ALLOC_MEM_FLAGS_AQL_QUEUE_MEM);
+ (*mem)->aql_queue = !!(flags & KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM);
/* Workaround for AQL queue wraparound bug. Map the same
* memory twice. That means we only actually allocate half
* the memory.
*/
if ((*mem)->aql_queue)
- size = size >> 1;
-
- /* Workaround for TLB bug on older VI chips */
- byte_align = (adev->family == AMDGPU_FAMILY_VI &&
- adev->asic_type != CHIP_FIJI &&
- adev->asic_type != CHIP_POLARIS10 &&
- adev->asic_type != CHIP_POLARIS11 &&
- adev->asic_type != CHIP_POLARIS12) ?
- VI_BO_SIZE_ALIGN : 1;
-
- mapping_flags = AMDGPU_VM_PAGE_READABLE;
- if (flags & ALLOC_MEM_FLAGS_WRITABLE)
- mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
- if (flags & ALLOC_MEM_FLAGS_EXECUTABLE)
- mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
- if (flags & ALLOC_MEM_FLAGS_COHERENT)
- mapping_flags |= AMDGPU_VM_MTYPE_UC;
- else
- mapping_flags |= AMDGPU_VM_MTYPE_NC;
- (*mem)->mapping_flags = mapping_flags;
+ size >>= 1;
+ aligned_size = PAGE_ALIGN(size);
+
+ (*mem)->alloc_flags = flags;
amdgpu_sync_create(&(*mem)->sync);
- ret = amdgpu_amdkfd_reserve_mem_limit(adev, size, alloc_domain, !!sg);
+ ret = amdgpu_amdkfd_reserve_mem_limit(adev, aligned_size, flags,
+ xcp_id);
if (ret) {
- pr_debug("Insufficient system memory\n");
+ pr_debug("Insufficient memory\n");
goto err_reserve_limit;
}
- pr_debug("\tcreate BO VA 0x%llx size 0x%llx domain %s\n",
- va, size, domain_string(alloc_domain));
+ pr_debug("\tcreate BO VA 0x%llx size 0x%llx domain %s xcp_id %d\n",
+ va, (*mem)->aql_queue ? size << 1 : size,
+ domain_string(alloc_domain), xcp_id);
- memset(&bp, 0, sizeof(bp));
- bp.size = size;
- bp.byte_align = byte_align;
- bp.domain = alloc_domain;
- bp.flags = alloc_flags;
- bp.type = bo_type;
- bp.resv = NULL;
- ret = amdgpu_bo_create(adev, &bp, &bo);
+ ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain, alloc_flags,
+ bo_type, NULL, &gobj, xcp_id + 1);
if (ret) {
pr_debug("Failed to create BO on domain %s. ret %d\n",
- domain_string(alloc_domain), ret);
+ domain_string(alloc_domain), ret);
goto err_bo_create;
}
+ ret = drm_vma_node_allow(&gobj->vma_node, drm_priv);
+ if (ret) {
+ pr_debug("Failed to allow vma node access. ret %d\n", ret);
+ goto err_node_allow;
+ }
+ ret = drm_gem_handle_create(adev->kfd.client.file, gobj, &(*mem)->gem_handle);
+ if (ret)
+ goto err_gem_handle_create;
+ bo = gem_to_amdgpu_bo(gobj);
if (bo_type == ttm_bo_type_sg) {
bo->tbo.sg = sg;
bo->tbo.ttm->sg = sg;
@@ -1298,22 +1817,38 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
bo->kfd_bo = *mem;
(*mem)->bo = bo;
if (user_addr)
- bo->flags |= AMDGPU_AMDKFD_USERPTR_BO;
+ bo->flags |= AMDGPU_AMDKFD_CREATE_USERPTR_BO;
(*mem)->va = va;
(*mem)->domain = domain;
(*mem)->mapped_to_gpu_memory = 0;
(*mem)->process_info = avm->process_info;
+
add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
if (user_addr) {
- ret = init_user_pages(*mem, current->mm, user_addr);
- if (ret) {
- mutex_lock(&avm->process_info->lock);
- list_del(&(*mem)->validate_list.head);
- mutex_unlock(&avm->process_info->lock);
+ pr_debug("creating userptr BO for user_addr = %llx\n", user_addr);
+ ret = init_user_pages(*mem, user_addr, criu_resume);
+ if (ret)
goto allocate_init_user_pages_failed;
+ } else if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
+ ret = amdgpu_amdkfd_gpuvm_pin_bo(bo, AMDGPU_GEM_DOMAIN_GTT);
+ if (ret) {
+ pr_err("Pinning MMIO/DOORBELL BO during ALLOC FAILED\n");
+ goto err_pin_bo;
}
+ bo->allowed_domains = AMDGPU_GEM_DOMAIN_GTT;
+ bo->preferred_domains = AMDGPU_GEM_DOMAIN_GTT;
+ } else {
+ mutex_lock(&avm->process_info->lock);
+ if (avm->process_info->eviction_fence &&
+ !dma_fence_is_signaled(&avm->process_info->eviction_fence->base))
+ ret = amdgpu_amdkfd_bo_validate_and_fence(bo, domain,
+ &avm->process_info->eviction_fence->base);
+ mutex_unlock(&avm->process_info->lock);
+ if (ret)
+ goto err_validate_bo;
}
if (offset)
@@ -1322,14 +1857,24 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
return 0;
allocate_init_user_pages_failed:
- amdgpu_bo_unref(&bo);
+err_pin_bo:
+err_validate_bo:
+ remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
+ drm_gem_handle_delete(adev->kfd.client.file, (*mem)->gem_handle);
+err_gem_handle_create:
+ drm_vma_node_revoke(&gobj->vma_node, drm_priv);
+err_node_allow:
/* Don't unreserve system mem limit twice */
goto err_reserve_limit;
err_bo_create:
- unreserve_mem_limit(adev, size, alloc_domain, !!sg);
+ amdgpu_amdkfd_unreserve_mem_limit(adev, aligned_size, flags, xcp_id);
err_reserve_limit:
+ amdgpu_sync_free(&(*mem)->sync);
mutex_destroy(&(*mem)->lock);
- kfree(*mem);
+ if (gobj)
+ drm_gem_object_put(gobj);
+ else
+ kfree(*mem);
err:
if (sg) {
sg_free_table(sg);
@@ -1339,99 +1884,128 @@ err:
}
int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
- struct kgd_dev *kgd, struct kgd_mem *mem)
+ struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv,
+ uint64_t *size)
{
struct amdkfd_process_info *process_info = mem->process_info;
- unsigned long bo_size = mem->bo->tbo.mem.size;
- struct kfd_bo_va_list *entry, *tmp;
+ unsigned long bo_size = mem->bo->tbo.base.size;
+ bool use_release_notifier = (mem->bo->kfd_bo == mem);
+ struct kfd_mem_attachment *entry, *tmp;
struct bo_vm_reservation_context ctx;
- struct ttm_validate_buffer *bo_list_entry;
+ unsigned int mapped_to_gpu_memory;
int ret;
+ bool is_imported = false;
mutex_lock(&mem->lock);
- if (mem->mapped_to_gpu_memory > 0) {
- pr_debug("BO VA 0x%llx size 0x%lx is still mapped.\n",
- mem->va, bo_size);
- mutex_unlock(&mem->lock);
- return -EBUSY;
+ /* Unpin MMIO/DOORBELL BOs that were pinned during allocation */
+ if (mem->alloc_flags &
+ (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
+ amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
}
+ mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
+ is_imported = mem->is_imported;
mutex_unlock(&mem->lock);
/* lock is not needed after this, since mem is unused and will
* be freed anyway
*/
- /* No more MMU notifiers */
- amdgpu_mn_unregister(mem->bo);
+ if (mapped_to_gpu_memory > 0) {
+ pr_debug("BO VA 0x%llx size 0x%lx is still mapped.\n",
+ mem->va, bo_size);
+ return -EBUSY;
+ }
/* Make sure restore workers don't access the BO any more */
- bo_list_entry = &mem->validate_list;
mutex_lock(&process_info->lock);
- list_del(&bo_list_entry->head);
+ list_del(&mem->validate_list);
mutex_unlock(&process_info->lock);
- /* Free user pages if necessary */
- if (mem->user_pages) {
- pr_debug("%s: Freeing user_pages array\n", __func__);
- if (mem->user_pages[0])
- release_pages(mem->user_pages,
- mem->bo->tbo.ttm->num_pages);
- kvfree(mem->user_pages);
+ /* Cleanup user pages and MMU notifiers */
+ if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {
+ amdgpu_hmm_unregister(mem->bo);
+ mutex_lock(&process_info->notifier_lock);
+ amdgpu_hmm_range_free(mem->range);
+ mutex_unlock(&process_info->notifier_lock);
}
ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx);
if (unlikely(ret))
return ret;
- /* The eviction fence should be removed by the last unmap.
- * TODO: Log an error condition if the bo still has the eviction fence
- * attached
- */
amdgpu_amdkfd_remove_eviction_fence(mem->bo,
- process_info->eviction_fence,
- NULL, NULL);
+ process_info->eviction_fence);
pr_debug("Release VA 0x%llx - 0x%llx\n", mem->va,
mem->va + bo_size * (1 + mem->aql_queue));
/* Remove from VM internal data structures */
- list_for_each_entry_safe(entry, tmp, &mem->bo_va_list, bo_list)
- remove_bo_from_vm((struct amdgpu_device *)entry->kgd_dev,
- entry, bo_size);
+ list_for_each_entry_safe(entry, tmp, &mem->attachments, list) {
+ kfd_mem_dmaunmap_attachment(mem, entry);
+ kfd_mem_detach(entry);
+ }
ret = unreserve_bo_and_vms(&ctx, false, false);
/* Free the sync object */
amdgpu_sync_free(&mem->sync);
- /* If the SG is not NULL, it's one we created for a doorbell
- * BO. We need to free it.
+ /* If the SG is not NULL, it's one we created for a doorbell or mmio
+ * remap BO. We need to free it.
*/
if (mem->bo->tbo.sg) {
sg_free_table(mem->bo->tbo.sg);
kfree(mem->bo->tbo.sg);
}
+ /* Update the size of the BO being freed if it was allocated from
+ * VRAM and is not imported. For APP APUs, VRAM allocations are done
+ * in the GTT domain.
+ */
+ if (size) {
+ if (!is_imported &&
+ mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
+ *size = bo_size;
+ else
+ *size = 0;
+ }
+
/* Free the BO*/
- amdgpu_bo_unref(&mem->bo);
+ drm_vma_node_revoke(&mem->bo->tbo.base.vma_node, drm_priv);
+ drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle);
+ if (mem->dmabuf) {
+ dma_buf_put(mem->dmabuf);
+ mem->dmabuf = NULL;
+ }
mutex_destroy(&mem->lock);
- kfree(mem);
+
+ /* If this releases the last reference, it will end up calling
+ * amdgpu_amdkfd_release_notify and kfree the mem struct. That's why
+ * this needs to be the last call here.
+ */
+ drm_gem_object_put(&mem->bo->tbo.base);
+
+ /*
+ * For kgd_mem allocated in amdgpu_amdkfd_gpuvm_import_dmabuf(),
+ * explicitly free it here.
+ */
+ if (!use_release_notifier)
+ kfree(mem);
return ret;
}
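A hedged pairing sketch for the allocate/free API above (the wrapper function and its variables are hypothetical; the flag combination is just one plausible choice). It shows how the new size out-parameter reports the VRAM accounting returned to the caller:

static int example_alloc_and_free(struct amdgpu_device *adev, void *drm_priv,
                                  uint64_t va, uint64_t bytes)
{
        uint64_t offset = 0, freed = 0;
        uint32_t flags = KFD_IOC_ALLOC_MEM_FLAGS_VRAM |
                         KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE;
        struct kgd_mem *mem;
        int r;

        r = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(adev, va, bytes, drm_priv,
                                                    &mem, &offset, flags, false);
        if (r)
                return r;

        /* ... map the BO to one or more GPUVMs, use it, unmap it again ... */

        r = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(adev, mem, drm_priv, &freed);
        /* 'freed' reports the BO size for a non-imported VRAM BO, 0 otherwise */
        return r;
}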
int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
+ struct amdgpu_device *adev, struct kgd_mem *mem,
+ void *drm_priv)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
+ struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
int ret;
struct amdgpu_bo *bo;
uint32_t domain;
- struct kfd_bo_va_list *entry;
+ struct kfd_mem_attachment *entry;
struct bo_vm_reservation_context ctx;
- struct kfd_bo_va_list *bo_va_entry = NULL;
- struct kfd_bo_va_list *bo_va_entry_aql = NULL;
unsigned long bo_size;
bool is_invalid_userptr = false;
@@ -1447,27 +2021,33 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
*/
mutex_lock(&mem->process_info->lock);
- /* Lock mmap-sem. If we find an invalid userptr BO, we can be
+ /* Take the notifier lock. If we find an invalid userptr BO, we can be
* sure that the MMU notifier is no longer running
* concurrently and the queues are actually stopped
*/
if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
- down_write(&current->mm->mmap_sem);
- is_invalid_userptr = atomic_read(&mem->invalid);
- up_write(&current->mm->mmap_sem);
+ mutex_lock(&mem->process_info->notifier_lock);
+ is_invalid_userptr = !!mem->invalid;
+ mutex_unlock(&mem->process_info->notifier_lock);
}
mutex_lock(&mem->lock);
domain = mem->domain;
- bo_size = bo->tbo.mem.size;
+ bo_size = bo->tbo.base.size;
pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",
mem->va,
mem->va + bo_size * (1 + mem->aql_queue),
- vm, domain_string(domain));
+ avm, domain_string(domain));
- ret = reserve_bo_and_vm(mem, vm, &ctx);
+ if (!kfd_mem_is_attached(avm, mem)) {
+ ret = kfd_mem_attach(adev, mem, avm, mem->aql_queue);
+ if (ret)
+ goto out;
+ }
+
+ ret = reserve_bo_and_vm(mem, avm, &ctx);
if (unlikely(ret))
goto out;
@@ -1477,80 +2057,44 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
* the next restore worker
*/
if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) &&
- bo->tbo.mem.mem_type == TTM_PL_SYSTEM)
+ bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
is_invalid_userptr = true;
- if (check_if_add_bo_to_vm(avm, mem)) {
- ret = add_bo_to_vm(adev, mem, avm, false,
- &bo_va_entry);
- if (ret)
- goto add_bo_to_vm_failed;
- if (mem->aql_queue) {
- ret = add_bo_to_vm(adev, mem, avm,
- true, &bo_va_entry_aql);
- if (ret)
- goto add_bo_to_vm_failed_aql;
- }
- } else {
- ret = vm_validate_pt_pd_bos(avm);
- if (unlikely(ret))
- goto add_bo_to_vm_failed;
- }
-
- if (mem->mapped_to_gpu_memory == 0 &&
- !amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
- /* Validate BO only once. The eviction fence gets added to BO
- * the first time it is mapped. Validate will wait for all
- * background evictions to complete.
- */
- ret = amdgpu_amdkfd_bo_validate(bo, domain, true);
- if (ret) {
- pr_debug("Validate failed\n");
- goto map_bo_to_gpuvm_failed;
- }
- }
+ ret = vm_validate_pt_pd_bos(avm, NULL);
+ if (unlikely(ret))
+ goto out_unreserve;
- list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
- if (entry->bo_va->base.vm == vm && !entry->is_mapped) {
- pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n",
- entry->va, entry->va + bo_size,
- entry);
+ list_for_each_entry(entry, &mem->attachments, list) {
+ if (entry->bo_va->base.vm != avm || entry->is_mapped)
+ continue;
- ret = map_bo_to_gpuvm(adev, entry, ctx.sync,
- is_invalid_userptr);
- if (ret) {
- pr_err("Failed to map radeon bo to gpuvm\n");
- goto map_bo_to_gpuvm_failed;
- }
+ pr_debug("\t map VA 0x%llx - 0x%llx in entry %p\n",
+ entry->va, entry->va + bo_size, entry);
- ret = vm_update_pds(vm, ctx.sync);
- if (ret) {
- pr_err("Failed to update page directories\n");
- goto map_bo_to_gpuvm_failed;
- }
+ ret = map_bo_to_gpuvm(mem, entry, ctx.sync,
+ is_invalid_userptr);
+ if (ret) {
+ pr_err("Failed to map bo to gpuvm\n");
+ goto out_unreserve;
+ }
- entry->is_mapped = true;
- mem->mapped_to_gpu_memory++;
- pr_debug("\t INC mapping count %d\n",
- mem->mapped_to_gpu_memory);
+ ret = vm_update_pds(avm, ctx.sync);
+ if (ret) {
+ pr_err("Failed to update page directories\n");
+ goto out_unreserve;
}
+
+ entry->is_mapped = true;
+ mem->mapped_to_gpu_memory++;
+ pr_debug("\t INC mapping count %d\n",
+ mem->mapped_to_gpu_memory);
}
- if (!amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) && !bo->pin_count)
- amdgpu_bo_fence(bo,
- &avm->process_info->eviction_fence->base,
- true);
ret = unreserve_bo_and_vms(&ctx, false, false);
goto out;
-map_bo_to_gpuvm_failed:
- if (bo_va_entry_aql)
- remove_bo_from_vm(adev, bo_va_entry_aql, bo_size);
-add_bo_to_vm_failed_aql:
- if (bo_va_entry)
- remove_bo_from_vm(adev, bo_va_entry, bo_size);
-add_bo_to_vm_failed:
+out_unreserve:
unreserve_bo_and_vms(&ctx, false, false);
out:
mutex_unlock(&mem->process_info->lock);
@@ -1558,20 +2102,49 @@ out:
return ret;
}
+int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
+{
+ struct kfd_mem_attachment *entry;
+ struct amdgpu_vm *vm;
+ int ret;
+
+ vm = drm_priv_to_vm(drm_priv);
+
+ mutex_lock(&mem->lock);
+
+ ret = amdgpu_bo_reserve(mem->bo, true);
+ if (ret)
+ goto out;
+
+ list_for_each_entry(entry, &mem->attachments, list) {
+ if (entry->bo_va->base.vm != vm)
+ continue;
+ if (entry->bo_va->base.bo->tbo.ttm &&
+ !entry->bo_va->base.bo->tbo.ttm->sg)
+ continue;
+
+ kfd_mem_dmaunmap_attachment(mem, entry);
+ }
+
+ amdgpu_bo_unreserve(mem->bo);
+out:
+ mutex_unlock(&mem->lock);
+
+ return ret;
+}
+
int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
- struct kgd_dev *kgd, struct kgd_mem *mem, void *vm)
+ struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv)
{
- struct amdgpu_device *adev = get_amdgpu_device(kgd);
- struct amdkfd_process_info *process_info =
- ((struct amdgpu_vm *)vm)->process_info;
- unsigned long bo_size = mem->bo->tbo.mem.size;
- struct kfd_bo_va_list *entry;
+ struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
+ unsigned long bo_size = mem->bo->tbo.base.size;
+ struct kfd_mem_attachment *entry;
struct bo_vm_reservation_context ctx;
int ret;
mutex_lock(&mem->lock);
- ret = reserve_bo_and_cond_vms(mem, vm, BO_VM_MAPPED, &ctx);
+ ret = reserve_bo_and_cond_vms(mem, avm, BO_VM_MAPPED, &ctx);
if (unlikely(ret))
goto out;
/* If no VMs were reserved, it means the BO wasn't actually mapped */
@@ -1580,45 +2153,32 @@ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
goto unreserve_out;
}
- ret = vm_validate_pt_pd_bos((struct amdgpu_vm *)vm);
+ ret = vm_validate_pt_pd_bos(avm, NULL);
if (unlikely(ret))
goto unreserve_out;
pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",
mem->va,
mem->va + bo_size * (1 + mem->aql_queue),
- vm);
-
- list_for_each_entry(entry, &mem->bo_va_list, bo_list) {
- if (entry->bo_va->base.vm == vm && entry->is_mapped) {
- pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
- entry->va,
- entry->va + bo_size,
- entry);
-
- ret = unmap_bo_from_gpuvm(adev, entry, ctx.sync);
- if (ret == 0) {
- entry->is_mapped = false;
- } else {
- pr_err("failed to unmap VA 0x%llx\n",
- mem->va);
- goto unreserve_out;
- }
+ avm);
- mem->mapped_to_gpu_memory--;
- pr_debug("\t DEC mapping count %d\n",
- mem->mapped_to_gpu_memory);
- }
- }
+ list_for_each_entry(entry, &mem->attachments, list) {
+ if (entry->bo_va->base.vm != avm || !entry->is_mapped)
+ continue;
- /* If BO is unmapped from all VMs, unfence it. It can be evicted if
- * required.
- */
- if (mem->mapped_to_gpu_memory == 0 &&
- !amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) && !mem->bo->pin_count)
- amdgpu_amdkfd_remove_eviction_fence(mem->bo,
- process_info->eviction_fence,
- NULL, NULL);
+ pr_debug("\t unmap VA 0x%llx - 0x%llx from entry %p\n",
+ entry->va, entry->va + bo_size, entry);
+
+ ret = unmap_bo_from_gpuvm(mem, entry, ctx.sync);
+ if (ret)
+ goto unreserve_out;
+
+ entry->is_mapped = false;
+
+ mem->mapped_to_gpu_memory--;
+ pr_debug("\t DEC mapping count %d\n",
+ mem->mapped_to_gpu_memory);
+ }
unreserve_out:
unreserve_bo_and_vms(&ctx, false, false);
@@ -1628,7 +2188,7 @@ out:
}
int amdgpu_amdkfd_gpuvm_sync_memory(
- struct kgd_dev *kgd, struct kgd_mem *mem, bool intr)
+ struct amdgpu_device *adev, struct kgd_mem *mem, bool intr)
{
struct amdgpu_sync sync;
int ret;
@@ -1644,8 +2204,69 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
return ret;
}
-int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
- struct kgd_mem *mem, void **kptr, uint64_t *size)
+/**
+ * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
+ * @bo: Buffer object to be mapped
+ * @bo_gart: Return bo reference
+ *
+ * Before returning, the bo reference count is incremented. To release the reference and unpin/
+ * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
+ */
+int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart)
+{
+ int ret;
+
+ ret = amdgpu_bo_reserve(bo, true);
+ if (ret) {
+ pr_err("Failed to reserve bo. ret %d\n", ret);
+ goto err_reserve_bo_failed;
+ }
+
+ ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
+ if (ret) {
+ pr_err("Failed to pin bo. ret %d\n", ret);
+ goto err_pin_bo_failed;
+ }
+
+ ret = amdgpu_ttm_alloc_gart(&bo->tbo);
+ if (ret) {
+ pr_err("Failed to bind bo to GART. ret %d\n", ret);
+ goto err_map_bo_gart_failed;
+ }
+
+ amdgpu_amdkfd_remove_eviction_fence(
+ bo, bo->vm_bo->vm->process_info->eviction_fence);
+
+ amdgpu_bo_unreserve(bo);
+
+ *bo_gart = amdgpu_bo_ref(bo);
+
+ return 0;
+
+err_map_bo_gart_failed:
+ amdgpu_bo_unpin(bo);
+err_pin_bo_failed:
+ amdgpu_bo_unreserve(bo);
+err_reserve_bo_failed:
+
+ return ret;
+}
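A hypothetical caller sketch for the GART helper above, only to illustrate the ownership contract spelled out in the kernel-doc (the extra reference is released later, per that comment, via amdgpu_amdkfd_free_gtt_mem):

static int example_map_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart)
{
        int r;

        r = amdgpu_amdkfd_map_gtt_bo_to_gart(bo, bo_gart);
        if (r)
                return r;

        /* *bo_gart now holds an extra reference on a pinned, GART-bound BO
         * with its eviction fence removed; drop the reference and unpin it
         * at teardown as the kernel-doc above describes.
         */
        return 0;
}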
+
+/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access
+ *
+ * @mem: Buffer object to be mapped for CPU access
+ * @kptr[out]: pointer in kernel CPU address space
+ * @size[out]: size of the buffer
+ *
+ * Pins the BO and maps it for kernel CPU access. The eviction fence is removed
+ * from the BO, since pinned BOs cannot be evicted. The bo must remain on the
+ * validate_list, so the GPU mapping can be restored after a page table was
+ * evicted.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
+ void **kptr, uint64_t *size)
{
int ret;
struct amdgpu_bo *bo = mem->bo;
@@ -1655,9 +2276,6 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
return -EINVAL;
}
- /* delete kgd_mem from kfd_bo_list to avoid re-validating
- * this BO in BO's restoring after eviction.
- */
mutex_lock(&mem->process_info->lock);
ret = amdgpu_bo_reserve(bo, true);
@@ -1679,8 +2297,7 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
}
amdgpu_amdkfd_remove_eviction_fence(
- bo, mem->process_info->eviction_fence, NULL, NULL);
- list_del_init(&mem->validate_list.head);
+ bo, mem->process_info->eviction_fence);
if (size)
*size = amdgpu_bo_size(bo);
@@ -1700,39 +2317,44 @@ bo_reserve_failed:
return ret;
}
-int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
- struct kfd_vm_fault_info *mem)
+/** amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel() - Unmap a GTT BO for kernel CPU access
+ *
+ * @mem: Buffer object to be unmapped for CPU access
+ *
+ * Removes the kernel CPU mapping and unpins the BO. It does not restore the
+ * eviction fence, so this function should only be used for cleanup before the
+ * BO is destroyed.
+ */
+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)
{
- struct amdgpu_device *adev;
+ struct amdgpu_bo *bo = mem->bo;
- adev = (struct amdgpu_device *)kgd;
- if (atomic_read(&adev->gmc.vm_fault_info_updated) == 1) {
+ (void)amdgpu_bo_reserve(bo, true);
+ amdgpu_bo_kunmap(bo);
+ amdgpu_bo_unpin(bo);
+ amdgpu_bo_unreserve(bo);
+}
+
+int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
+ struct kfd_vm_fault_info *mem)
+{
+ if (atomic_read_acquire(&adev->gmc.vm_fault_info_updated) == 1) {
*mem = *adev->gmc.vm_fault_info;
- mb();
- atomic_set(&adev->gmc.vm_fault_info_updated, 0);
+ atomic_set_release(&adev->gmc.vm_fault_info_updated, 0);
}
return 0;
}
-int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
- struct dma_buf *dma_buf,
- uint64_t va, void *vm,
- struct kgd_mem **mem, uint64_t *size,
- uint64_t *mmap_offset)
+static int import_obj_create(struct amdgpu_device *adev,
+ struct dma_buf *dma_buf,
+ struct drm_gem_object *obj,
+ uint64_t va, void *drm_priv,
+ struct kgd_mem **mem, uint64_t *size,
+ uint64_t *mmap_offset)
{
- struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
- struct drm_gem_object *obj;
+ struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
struct amdgpu_bo *bo;
- struct amdgpu_vm *avm = (struct amdgpu_vm *)vm;
-
- if (dma_buf->ops != &amdgpu_dmabuf_ops)
- /* Can't handle non-graphics buffers */
- return -EINVAL;
-
- obj = dma_buf->priv;
- if (obj->dev->dev_private != adev)
- /* Can't handle buffers from other devices */
- return -EINVAL;
+ int ret;
bo = gem_to_amdgpu_bo(obj);
if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
@@ -1744,58 +2366,150 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,
if (!*mem)
return -ENOMEM;
+ ret = drm_vma_node_allow(&obj->vma_node, drm_priv);
+ if (ret)
+ goto err_free_mem;
+
if (size)
*size = amdgpu_bo_size(bo);
if (mmap_offset)
*mmap_offset = amdgpu_bo_mmap_offset(bo);
- INIT_LIST_HEAD(&(*mem)->bo_va_list);
+ INIT_LIST_HEAD(&(*mem)->attachments);
mutex_init(&(*mem)->lock);
- (*mem)->mapping_flags =
- AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
- AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC;
- (*mem)->bo = amdgpu_bo_ref(bo);
+ (*mem)->alloc_flags =
+ ((bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
+ KFD_IOC_ALLOC_MEM_FLAGS_VRAM : KFD_IOC_ALLOC_MEM_FLAGS_GTT)
+ | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
+ | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
+
+ get_dma_buf(dma_buf);
+ (*mem)->dmabuf = dma_buf;
+ (*mem)->bo = bo;
(*mem)->va = va;
- (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
- AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
+ (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) &&
+ !adev->apu_prefer_gtt ?
+ AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
+
(*mem)->mapped_to_gpu_memory = 0;
(*mem)->process_info = avm->process_info;
add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, false);
amdgpu_sync_create(&(*mem)->sync);
+ (*mem)->is_imported = true;
+
+ mutex_lock(&avm->process_info->lock);
+ if (avm->process_info->eviction_fence &&
+ !dma_fence_is_signaled(&avm->process_info->eviction_fence->base))
+ ret = amdgpu_amdkfd_bo_validate_and_fence(bo, (*mem)->domain,
+ &avm->process_info->eviction_fence->base);
+ mutex_unlock(&avm->process_info->lock);
+ if (ret)
+ goto err_remove_mem;
+
+ return 0;
+
+err_remove_mem:
+ remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
+ drm_vma_node_revoke(&obj->vma_node, drm_priv);
+err_free_mem:
+ kfree(*mem);
+ return ret;
+}
+
+int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
+ uint64_t va, void *drm_priv,
+ struct kgd_mem **mem, uint64_t *size,
+ uint64_t *mmap_offset)
+{
+ struct drm_gem_object *obj;
+ uint32_t handle;
+ int ret;
+
+ ret = drm_gem_prime_fd_to_handle(&adev->ddev, adev->kfd.client.file, fd,
+ &handle);
+ if (ret)
+ return ret;
+ obj = drm_gem_object_lookup(adev->kfd.client.file, handle);
+ if (!obj) {
+ ret = -EINVAL;
+ goto err_release_handle;
+ }
+
+ ret = import_obj_create(adev, obj->dma_buf, obj, va, drm_priv, mem, size,
+ mmap_offset);
+ if (ret)
+ goto err_put_obj;
+
+ (*mem)->gem_handle = handle;
return 0;
+
+err_put_obj:
+ drm_gem_object_put(obj);
+err_release_handle:
+ drm_gem_handle_delete(adev->kfd.client.file, handle);
+ return ret;
+}
+
+int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
+ struct dma_buf **dma_buf)
+{
+ int ret;
+
+ mutex_lock(&mem->lock);
+ ret = kfd_mem_export_dmabuf(mem);
+ if (ret)
+ goto out;
+
+ get_dma_buf(mem->dmabuf);
+ *dma_buf = mem->dmabuf;
+out:
+ mutex_unlock(&mem->lock);
+ return ret;
}
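A minimal, hypothetical sketch of the export path above; the consumer of the dma-buf is left abstract, and the extra reference taken by the helper must be dropped with dma_buf_put():

static int example_export(struct kgd_mem *mem)
{
        struct dma_buf *dmabuf;
        int r;

        r = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf);
        if (r)
                return r;

        /* ... hand 'dmabuf' to another device or subsystem ... */

        dma_buf_put(dmabuf);    /* drop the reference returned above */
        return 0;
}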
/* Evict a userptr BO by stopping the queues if necessary
*
* Runs in MMU notifier, may be in RECLAIM_FS context. This means it
* cannot do any memory allocations, and cannot take any locks that
- * are held elsewhere while allocating memory. Therefore this is as
- * simple as possible, using atomic counters.
+ * are held elsewhere while allocating memory.
*
* It doesn't do anything to the BO itself. The real work happens in
* restore, where we get updated page addresses. This function only
* ensures that GPU access to the BO is stopped.
*/
-int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
- struct mm_struct *mm)
+int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
+ unsigned long cur_seq, struct kgd_mem *mem)
{
struct amdkfd_process_info *process_info = mem->process_info;
- int invalid, evicted_bos;
int r = 0;
- invalid = atomic_inc_return(&mem->invalid);
- evicted_bos = atomic_inc_return(&process_info->evicted_bos);
- if (evicted_bos == 1) {
+ /* Do not process MMU notifications during CRIU restore until
+ * KFD_CRIU_OP_RESUME IOCTL is received
+ */
+ if (READ_ONCE(process_info->block_mmu_notifications))
+ return 0;
+
+ mutex_lock(&process_info->notifier_lock);
+ mmu_interval_set_seq(mni, cur_seq);
+
+ mem->invalid++;
+ if (++process_info->evicted_bos == 1) {
/* First eviction, stop the queues */
- r = kgd2kfd->quiesce_mm(mm);
- if (r)
+ r = kgd2kfd_quiesce_mm(mni->mm,
+ KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
+
+ if (r && r != -ESRCH)
pr_err("Failed to quiesce KFD\n");
- schedule_delayed_work(&process_info->restore_userptr_work,
- msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+ if (r != -ESRCH)
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
+ msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
}
+ mutex_unlock(&process_info->notifier_lock);
return r;
}
@@ -1812,165 +2526,174 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
struct kgd_mem *mem, *tmp_mem;
struct amdgpu_bo *bo;
struct ttm_operation_ctx ctx = { false, false };
- int invalid, ret;
+ uint32_t invalid;
+ int ret = 0;
- /* Move all invalidated BOs to the userptr_inval_list and
- * release their user pages by migration to the CPU domain
- */
+ mutex_lock(&process_info->notifier_lock);
+
+ /* Move all invalidated BOs to the userptr_inval_list */
list_for_each_entry_safe(mem, tmp_mem,
&process_info->userptr_valid_list,
- validate_list.head) {
- if (!atomic_read(&mem->invalid))
- continue; /* BO is still valid */
-
- bo = mem->bo;
-
- if (amdgpu_bo_reserve(bo, true))
- return -EAGAIN;
- amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
- ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
- amdgpu_bo_unreserve(bo);
- if (ret) {
- pr_err("%s: Failed to invalidate userptr BO\n",
- __func__);
- return -EAGAIN;
- }
-
- list_move_tail(&mem->validate_list.head,
- &process_info->userptr_inval_list);
- }
-
- if (list_empty(&process_info->userptr_inval_list))
- return 0; /* All evicted userptr BOs were freed */
+ validate_list)
+ if (mem->invalid)
+ list_move_tail(&mem->validate_list,
+ &process_info->userptr_inval_list);
/* Go through userptr_inval_list and update any invalid user_pages */
list_for_each_entry(mem, &process_info->userptr_inval_list,
- validate_list.head) {
- invalid = atomic_read(&mem->invalid);
+ validate_list) {
+ invalid = mem->invalid;
if (!invalid)
/* BO hasn't been invalidated since the last
- * revalidation attempt. Keep its BO list.
+ * revalidation attempt. Keep its page list.
*/
continue;
bo = mem->bo;
- if (!mem->user_pages) {
- mem->user_pages =
- kvmalloc_array(bo->tbo.ttm->num_pages,
- sizeof(struct page *),
- GFP_KERNEL | __GFP_ZERO);
- if (!mem->user_pages) {
- pr_err("%s: Failed to allocate pages array\n",
+ amdgpu_hmm_range_free(mem->range);
+ mem->range = NULL;
+
+ /* BO reservations and getting user pages (hmm_range_fault)
+ * must happen outside the notifier lock
+ */
+ mutex_unlock(&process_info->notifier_lock);
+
+ /* Move the BO to system (CPU) domain if necessary to unmap
+ * and free the SG table
+ */
+ if (bo->tbo.resource->mem_type != TTM_PL_SYSTEM) {
+ if (amdgpu_bo_reserve(bo, true))
+ return -EAGAIN;
+ amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+ amdgpu_bo_unreserve(bo);
+ if (ret) {
+ pr_err("%s: Failed to invalidate userptr BO\n",
__func__);
- return -ENOMEM;
+ return -EAGAIN;
}
- } else if (mem->user_pages[0]) {
- release_pages(mem->user_pages, bo->tbo.ttm->num_pages);
}
+ mem->range = amdgpu_hmm_range_alloc(NULL);
+ if (unlikely(!mem->range))
+ return -ENOMEM;
/* Get updated user pages */
- ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
- mem->user_pages);
+ ret = amdgpu_ttm_tt_get_user_pages(bo, mem->range);
if (ret) {
- mem->user_pages[0] = NULL;
- pr_info("%s: Failed to get user pages: %d\n",
- __func__, ret);
- /* Pretend it succeeded. It will fail later
- * with a VM fault if the GPU tries to access
- * it. Better than hanging indefinitely with
- * stalled user mode queues.
+ amdgpu_hmm_range_free(mem->range);
+ mem->range = NULL;
+ pr_debug("Failed %d to get user pages\n", ret);
+
+ /* Treat -EFAULT (bad address) as success. It will
+ * fail later with a VM fault if the GPU tries to access
+ * it. Better than hanging indefinitely with stalled
+ * user mode queues.
+ *
+ * Return any other error (-EBUSY or -ENOMEM) so the restore is retried
*/
+ if (ret != -EFAULT)
+ return ret;
+
+ /* If applications unmap memory before destroying the userptr
+ * from the KFD, trigger a segmentation fault in VM debug mode.
+ */
+ if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) {
+ struct kfd_process *p;
+
+ pr_err("Pid %d unmapped memory before destroying userptr at GPU addr 0x%llx\n",
+ pid_nr(process_info->pid), mem->va);
+
+ // Send GPU VM fault to user space
+ p = kfd_lookup_process_by_pid(process_info->pid);
+ if (p) {
+ kfd_signal_vm_fault_event_with_userptr(p, mem->va);
+ kfd_unref_process(p);
+ }
+ }
+
+ ret = 0;
}
+ amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm, mem->range);
+
+ mutex_lock(&process_info->notifier_lock);
+
/* Mark the BO as valid unless it was invalidated
- * again concurrently
+ * again concurrently.
*/
- if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid)
- return -EAGAIN;
+ if (mem->invalid != invalid) {
+ ret = -EAGAIN;
+ goto unlock_out;
+ }
+ /* set mem valid if mem has hmm range associated */
+ if (mem->range)
+ mem->invalid = 0;
}
- return 0;
+unlock_out:
+ mutex_unlock(&process_info->notifier_lock);
+
+ return ret;
}
/* Validate invalid userptr BOs
*
- * Validates BOs on the userptr_inval_list, and moves them back to the
- * userptr_valid_list. Also updates GPUVM page tables with new page
- * addresses and waits for the page table updates to complete.
+ * Validates BOs on the userptr_inval_list. Also updates GPUVM page tables
+ * with new page addresses and waits for the page table updates to complete.
*/
static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
{
- struct amdgpu_bo_list_entry *pd_bo_list_entries;
- struct list_head resv_list, duplicates;
- struct ww_acquire_ctx ticket;
+ struct ttm_operation_ctx ctx = { false, false };
struct amdgpu_sync sync;
+ struct drm_exec exec;
struct amdgpu_vm *peer_vm;
struct kgd_mem *mem, *tmp_mem;
struct amdgpu_bo *bo;
- struct ttm_operation_ctx ctx = { false, false };
- int i, ret;
-
- pd_bo_list_entries = kcalloc(process_info->n_vms,
- sizeof(struct amdgpu_bo_list_entry),
- GFP_KERNEL);
- if (!pd_bo_list_entries) {
- pr_err("%s: Failed to allocate PD BO list entries\n", __func__);
- return -ENOMEM;
- }
-
- INIT_LIST_HEAD(&resv_list);
- INIT_LIST_HEAD(&duplicates);
+ int ret;
- /* Get all the page directory BOs that need to be reserved */
- i = 0;
- list_for_each_entry(peer_vm, &process_info->vm_list_head,
- vm_list_node)
- amdgpu_vm_get_pd_bo(peer_vm, &resv_list,
- &pd_bo_list_entries[i++]);
- /* Add the userptr_inval_list entries to resv_list */
- list_for_each_entry(mem, &process_info->userptr_inval_list,
- validate_list.head) {
- list_add_tail(&mem->resv_list.head, &resv_list);
- mem->resv_list.bo = mem->validate_list.bo;
- mem->resv_list.num_shared = mem->validate_list.num_shared;
- }
+ amdgpu_sync_create(&sync);
+ drm_exec_init(&exec, 0, 0);
/* Reserve all BOs and page tables for validation */
- ret = ttm_eu_reserve_buffers(&ticket, &resv_list, false, &duplicates);
- WARN(!list_empty(&duplicates), "Duplicates should be empty");
- if (ret)
- goto out;
+ drm_exec_until_all_locked(&exec) {
+ /* Reserve all the page directories */
+ list_for_each_entry(peer_vm, &process_info->vm_list_head,
+ vm_list_node) {
+ ret = amdgpu_vm_lock_pd(peer_vm, &exec, 2);
+ drm_exec_retry_on_contention(&exec);
+ if (unlikely(ret))
+ goto unreserve_out;
+ }
- amdgpu_sync_create(&sync);
+ /* Reserve the userptr_inval_list entries to resv_list */
+ list_for_each_entry(mem, &process_info->userptr_inval_list,
+ validate_list) {
+ struct drm_gem_object *gobj;
- /* Avoid triggering eviction fences when unmapping invalid
- * userptr BOs (waits for all fences, doesn't use
- * FENCE_OWNER_VM)
- */
- list_for_each_entry(peer_vm, &process_info->vm_list_head,
- vm_list_node)
- amdgpu_amdkfd_remove_eviction_fence(peer_vm->root.base.bo,
- process_info->eviction_fence,
- NULL, NULL);
+ gobj = &mem->bo->tbo.base;
+ ret = drm_exec_prepare_obj(&exec, gobj, 1);
+ drm_exec_retry_on_contention(&exec);
+ if (unlikely(ret))
+ goto unreserve_out;
+ }
+ }
- ret = process_validate_vms(process_info);
+ ret = process_validate_vms(process_info, NULL);
if (ret)
goto unreserve_out;
/* Validate BOs and update GPUVM page tables */
list_for_each_entry_safe(mem, tmp_mem,
&process_info->userptr_inval_list,
- validate_list.head) {
- struct kfd_bo_va_list *bo_va_entry;
+ validate_list) {
+ struct kfd_mem_attachment *attachment;
bo = mem->bo;
- /* Copy pages array and validate the BO if we got user pages */
- if (mem->user_pages[0]) {
- amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
- mem->user_pages);
+ /* Validate the BO if we got user pages */
+ if (bo->tbo.ttm->pages[0]) {
amdgpu_bo_placement_from_domain(bo, mem->domain);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (ret) {
@@ -1979,33 +2702,24 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
}
}
- /* Validate succeeded, now the BO owns the pages, free
- * our copy of the pointer array. Put this BO back on
- * the userptr_valid_list. If we need to revalidate
- * it, we need to start from scratch.
- */
- kvfree(mem->user_pages);
- mem->user_pages = NULL;
- list_move_tail(&mem->validate_list.head,
- &process_info->userptr_valid_list);
-
/* Update mapping. If the BO was not validated
* (because we couldn't get user pages), this will
* clear the page table entries, which will result in
* VM faults if the GPU tries to access the invalid
* memory.
*/
- list_for_each_entry(bo_va_entry, &mem->bo_va_list, bo_list) {
- if (!bo_va_entry->is_mapped)
+ list_for_each_entry(attachment, &mem->attachments, list) {
+ if (!attachment->is_mapped)
continue;
- ret = update_gpuvm_pte((struct amdgpu_device *)
- bo_va_entry->kgd_dev,
- bo_va_entry, &sync);
+ kfd_mem_dmaunmap_attachment(mem, attachment);
+ ret = update_gpuvm_pte(mem, attachment, &sync);
if (ret) {
pr_err("%s: update PTE failed\n", __func__);
/* make sure this gets validated again */
- atomic_inc(&mem->invalid);
+ mutex_lock(&process_info->notifier_lock);
+ mem->invalid++;
+ mutex_unlock(&process_info->notifier_lock);
goto unreserve_out;
}
}
@@ -2015,15 +2729,51 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
ret = process_update_pds(process_info, &sync);
unreserve_out:
- list_for_each_entry(peer_vm, &process_info->vm_list_head,
- vm_list_node)
- amdgpu_bo_fence(peer_vm->root.base.bo,
- &process_info->eviction_fence->base, true);
- ttm_eu_backoff_reservation(&ticket, &resv_list);
+ drm_exec_fini(&exec);
amdgpu_sync_wait(&sync, false);
amdgpu_sync_free(&sync);
-out:
- kfree(pd_bo_list_entries);
+
+ return ret;
+}
+
+/* Confirm that all user pages are valid while holding the notifier lock
+ *
+ * Moves valid BOs from the userptr_inval_list back to the userptr_valid_list.
+ */
+static int confirm_valid_user_pages_locked(struct amdkfd_process_info *process_info)
+{
+ struct kgd_mem *mem, *tmp_mem;
+ int ret = 0;
+
+ list_for_each_entry_safe(mem, tmp_mem,
+ &process_info->userptr_inval_list,
+ validate_list) {
+ bool valid;
+
+ /* Keep mem without an hmm range on the userptr_inval_list */
+ if (!mem->range)
+ continue;
+
+ /* Only check mem with hmm range associated */
+ valid = amdgpu_hmm_range_valid(mem->range);
+ amdgpu_hmm_range_free(mem->range);
+
+ mem->range = NULL;
+ if (!valid) {
+ WARN(!mem->invalid, "Invalid BO not marked invalid");
+ ret = -EAGAIN;
+ continue;
+ }
+
+ if (mem->invalid) {
+ WARN(1, "Valid BO is marked invalid");
+ ret = -EAGAIN;
+ continue;
+ }
+
+ list_move_tail(&mem->validate_list,
+ &process_info->userptr_valid_list);
+ }
return ret;
}
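The validity check above follows the core kernel's interval-notifier pattern; a generic sketch (independent of the amdgpu_hmm_* helpers, shown only as an assumption about how such a check is typically built):

static bool example_pages_still_valid(struct mmu_interval_notifier *notifier,
                                      unsigned long seq)
{
        /* 'seq' was sampled with mmu_interval_read_begin() before faulting
         * the pages; any invalidation since then forces another round.
         */
        return !mmu_interval_read_retry(notifier, seq);
}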
@@ -2042,9 +2792,11 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
restore_userptr_work);
struct task_struct *usertask;
struct mm_struct *mm;
- int evicted_bos;
+ uint32_t evicted_bos;
- evicted_bos = atomic_read(&process_info->evicted_bos);
+ mutex_lock(&process_info->notifier_lock);
+ evicted_bos = process_info->evicted_bos;
+ mutex_unlock(&process_info->notifier_lock);
if (!evicted_bos)
return;
@@ -2067,9 +2819,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
* and we can just restart the queues.
*/
if (!list_empty(&process_info->userptr_inval_list)) {
- if (atomic_read(&process_info->evicted_bos) != evicted_bos)
- goto unlock_out; /* Concurrent eviction, try again */
-
if (validate_invalid_user_pages(process_info))
goto unlock_out;
}
@@ -2078,25 +2827,56 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
* be a first eviction that calls quiesce_mm. The eviction
* reference counting inside KFD will handle this case.
*/
- if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) !=
- evicted_bos)
- goto unlock_out;
- evicted_bos = 0;
- if (kgd2kfd->resume_mm(mm)) {
+ mutex_lock(&process_info->notifier_lock);
+ if (process_info->evicted_bos != evicted_bos)
+ goto unlock_notifier_out;
+
+ if (confirm_valid_user_pages_locked(process_info)) {
+ WARN(1, "User pages unexpectedly invalid");
+ goto unlock_notifier_out;
+ }
+
+ process_info->evicted_bos = evicted_bos = 0;
+
+ if (kgd2kfd_resume_mm(mm)) {
pr_err("%s: Failed to resume KFD\n", __func__);
/* No recovery from this failure. Probably the CP is
* hanging. No point trying again.
*/
}
+
+unlock_notifier_out:
+ mutex_unlock(&process_info->notifier_lock);
unlock_out:
mutex_unlock(&process_info->lock);
- mmput(mm);
- put_task_struct(usertask);
/* If validation failed, reschedule another attempt */
- if (evicted_bos)
- schedule_delayed_work(&process_info->restore_userptr_work,
+ if (evicted_bos) {
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
+
+ kfd_smi_event_queue_restore_rescheduled(mm);
+ }
+ mmput(mm);
+ put_task_struct(usertask);
+}
+
+static void replace_eviction_fence(struct dma_fence __rcu **ef,
+ struct dma_fence *new_ef)
+{
+ struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
+ /* protected by process_info->lock */);
+
+ /* If we're replacing an unsignaled eviction fence, that fence will
+ * never be signaled, and if anyone is still waiting on that fence,
+ * they will hang forever. This should never happen. We should only
+ * replace the fence in restore_work that only gets scheduled after
+ * eviction work signaled the fence.
+ */
+ WARN_ONCE(!dma_fence_is_signaled(old_ef),
+ "Replacing unsignaled eviction fence");
+ dma_fence_put(old_ef);
}
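A sketch of a possible read side (an assumption for illustration, not code from this patch): a reader that does not hold process_info->lock must take its own fence reference under RCU before the pointer can be replaced again:

static struct dma_fence *example_get_eviction_fence(struct dma_fence __rcu **ef)
{
        struct dma_fence *fence;

        rcu_read_lock();
        fence = dma_fence_get_rcu_safe(ef);     /* NULL or a referenced fence */
        rcu_read_unlock();

        return fence;   /* caller must dma_fence_put() when done */
}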
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
@@ -2117,89 +2897,108 @@ unlock_out:
* 7. Add fence to all PD and PT BOs.
* 8. Unreserve all BOs
*/
-int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
{
- struct amdgpu_bo_list_entry *pd_bo_list;
struct amdkfd_process_info *process_info = info;
struct amdgpu_vm *peer_vm;
struct kgd_mem *mem;
- struct bo_vm_reservation_context ctx;
- struct amdgpu_amdkfd_fence *new_fence;
- int ret = 0, i;
struct list_head duplicate_save;
struct amdgpu_sync sync_obj;
+ unsigned long failed_size = 0;
+ unsigned long total_size = 0;
+ struct drm_exec exec;
+ int ret;
INIT_LIST_HEAD(&duplicate_save);
- INIT_LIST_HEAD(&ctx.list);
- INIT_LIST_HEAD(&ctx.duplicates);
- pd_bo_list = kcalloc(process_info->n_vms,
- sizeof(struct amdgpu_bo_list_entry),
- GFP_KERNEL);
- if (!pd_bo_list)
- return -ENOMEM;
-
- i = 0;
mutex_lock(&process_info->lock);
- list_for_each_entry(peer_vm, &process_info->vm_list_head,
- vm_list_node)
- amdgpu_vm_get_pd_bo(peer_vm, &ctx.list, &pd_bo_list[i++]);
-
- /* Reserve all BOs and page tables/directory. Add all BOs from
- * kfd_bo_list to ctx.list
- */
- list_for_each_entry(mem, &process_info->kfd_bo_list,
- validate_list.head) {
- list_add_tail(&mem->resv_list.head, &ctx.list);
- mem->resv_list.bo = mem->validate_list.bo;
- mem->resv_list.num_shared = mem->validate_list.num_shared;
- }
+ drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
+ drm_exec_until_all_locked(&exec) {
+ list_for_each_entry(peer_vm, &process_info->vm_list_head,
+ vm_list_node) {
+ ret = amdgpu_vm_lock_pd(peer_vm, &exec, 2);
+ drm_exec_retry_on_contention(&exec);
+ if (unlikely(ret)) {
+ pr_err("Locking VM PD failed, ret: %d\n", ret);
+ goto ttm_reserve_fail;
+ }
+ }
- ret = ttm_eu_reserve_buffers(&ctx.ticket, &ctx.list,
- false, &duplicate_save);
- if (ret) {
- pr_debug("Memory eviction: TTM Reserve Failed. Try again\n");
- goto ttm_reserve_fail;
+ /* Reserve all BOs and page tables/directory. Add all BOs from
+ * kfd_bo_list to ctx.list
+ */
+ list_for_each_entry(mem, &process_info->kfd_bo_list,
+ validate_list) {
+ struct drm_gem_object *gobj;
+
+ gobj = &mem->bo->tbo.base;
+ ret = drm_exec_prepare_obj(&exec, gobj, 1);
+ drm_exec_retry_on_contention(&exec);
+ if (unlikely(ret)) {
+ pr_err("drm_exec_prepare_obj failed, ret: %d\n", ret);
+ goto ttm_reserve_fail;
+ }
+ }
}
amdgpu_sync_create(&sync_obj);
- /* Validate PDs and PTs */
- ret = process_validate_vms(process_info);
- if (ret)
- goto validate_map_fail;
-
- ret = process_sync_pds_resv(process_info, &sync_obj);
- if (ret) {
- pr_debug("Memory eviction: Failed to sync to PD BO moving fence. Try again\n");
- goto validate_map_fail;
- }
-
- /* Validate BOs and map them to GPUVM (update VM page tables). */
+ /* Validate BOs managed by KFD */
list_for_each_entry(mem, &process_info->kfd_bo_list,
- validate_list.head) {
+ validate_list) {
struct amdgpu_bo *bo = mem->bo;
uint32_t domain = mem->domain;
- struct kfd_bo_va_list *bo_va_entry;
+ struct dma_resv_iter cursor;
+ struct dma_fence *fence;
+
+ total_size += amdgpu_bo_size(bo);
ret = amdgpu_amdkfd_bo_validate(bo, domain, false);
if (ret) {
- pr_debug("Memory eviction: Validate BOs failed. Try again\n");
- goto validate_map_fail;
+ pr_debug("Memory eviction: Validate BOs failed\n");
+ failed_size += amdgpu_bo_size(bo);
+ ret = amdgpu_amdkfd_bo_validate(bo,
+ AMDGPU_GEM_DOMAIN_GTT, false);
+ if (ret) {
+ pr_debug("Memory eviction: Try again\n");
+ goto validate_map_fail;
+ }
}
- ret = amdgpu_sync_fence(NULL, &sync_obj, bo->tbo.moving, false);
- if (ret) {
- pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
- goto validate_map_fail;
+ dma_resv_for_each_fence(&cursor, bo->tbo.base.resv,
+ DMA_RESV_USAGE_KERNEL, fence) {
+ ret = amdgpu_sync_fence(&sync_obj, fence, GFP_KERNEL);
+ if (ret) {
+ pr_debug("Memory eviction: Sync BO fence failed. Try again\n");
+ goto validate_map_fail;
+ }
}
- list_for_each_entry(bo_va_entry, &mem->bo_va_list,
- bo_list) {
- ret = update_gpuvm_pte((struct amdgpu_device *)
- bo_va_entry->kgd_dev,
- bo_va_entry,
- &sync_obj);
+ }
+
+ if (failed_size)
+ pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size);
+
+ /* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO
+ * validations above would invalidate DMABuf imports again.
+ */
+ ret = process_validate_vms(process_info, &exec.ticket);
+ if (ret) {
+ pr_debug("Validating VMs failed, ret: %d\n", ret);
+ goto validate_map_fail;
+ }
+
+ /* Update mappings managed by KFD. */
+ list_for_each_entry(mem, &process_info->kfd_bo_list,
+ validate_list) {
+ struct kfd_mem_attachment *attachment;
+
+ list_for_each_entry(attachment, &mem->attachments, list) {
+ if (!attachment->is_mapped)
+ continue;
+
+ kfd_mem_dmaunmap_attachment(mem, attachment);
+ ret = update_gpuvm_pte(mem, attachment, &sync_obj);
if (ret) {
pr_debug("Memory eviction: update PTE failed. Try again\n");
goto validate_map_fail;
@@ -2207,6 +3006,32 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
}
}
+ /* Update mappings not managed by KFD */
+ list_for_each_entry(peer_vm, &process_info->vm_list_head,
+ vm_list_node) {
+ struct amdgpu_device *adev = amdgpu_ttm_adev(
+ peer_vm->root.bo->tbo.bdev);
+
+ struct amdgpu_fpriv *fpriv =
+ container_of(peer_vm, struct amdgpu_fpriv, vm);
+
+ ret = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
+ if (ret) {
+ dev_dbg(adev->dev,
+ "Memory eviction: handle PRT moved failed, pid %8d. Try again.\n",
+ pid_nr(process_info->pid));
+ goto validate_map_fail;
+ }
+
+ ret = amdgpu_vm_handle_moved(adev, peer_vm, &exec.ticket);
+ if (ret) {
+ dev_dbg(adev->dev,
+ "Memory eviction: handle moved failed, pid %8d. Try again.\n",
+ pid_nr(process_info->pid));
+ goto validate_map_fail;
+ }
+ }
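Recovering the amdgpu_fpriv from the peer VM relies on the VM being embedded in the file-private structure, so container_of() can rewind from the member to the enclosing object. A generic sketch of the pattern with hypothetical types:

    #include <linux/container_of.h>

    struct example_vm { int dummy; };

    struct example_fpriv {
            int other_state;
            struct example_vm vm;   /* embedded, like amdgpu_fpriv::vm */
    };

    /* Sketch only: rewind from the embedded member to the enclosing struct */
    static struct example_fpriv *example_vm_to_fpriv(struct example_vm *vm)
    {
            return container_of(vm, struct example_fpriv, vm);
    }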
+
/* Update page directories */
ret = process_update_pds(process_info, &sync_obj);
if (ret) {
@@ -2214,44 +3039,214 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
goto validate_map_fail;
}
+ /* Sync with fences on all the page tables. They implicitly depend on any
+ * move fences from amdgpu_vm_handle_moved above.
+ */
+ ret = process_sync_pds_resv(process_info, &sync_obj);
+ if (ret) {
+ pr_debug("Memory eviction: Failed to sync to PD BO moving fence. Try again\n");
+ goto validate_map_fail;
+ }
+
/* Wait for validate and PT updates to finish */
amdgpu_sync_wait(&sync_obj, false);
- /* Release old eviction fence and create new one, because fence only
- * goes from unsignaled to signaled, fence cannot be reused.
- * Use context and mm from the old fence.
+ /* The old eviction fence may be unsignaled if restore happens
+ * after a GPU reset or suspend/resume. Keep the old fence in that
+ * case. Otherwise release the old eviction fence and create a new
+ * one, because a fence only goes from unsignaled to signaled once
+ * and cannot be reused. Use the context and mm from the old fence.
+ *
+ * If an old eviction fence signals after this check, that's OK.
+ * Anyone signaling an eviction fence must stop the queues first
+ * and schedule another restore worker.
*/
- new_fence = amdgpu_amdkfd_fence_create(
+ if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {
+ struct amdgpu_amdkfd_fence *new_fence =
+ amdgpu_amdkfd_fence_create(
process_info->eviction_fence->base.context,
- process_info->eviction_fence->mm);
- if (!new_fence) {
- pr_err("Failed to create eviction fence\n");
- ret = -ENOMEM;
- goto validate_map_fail;
+ process_info->eviction_fence->mm,
+ NULL);
+
+ if (!new_fence) {
+ pr_err("Failed to create eviction fence\n");
+ ret = -ENOMEM;
+ goto validate_map_fail;
+ }
+ dma_fence_put(&process_info->eviction_fence->base);
+ process_info->eviction_fence = new_fence;
+ replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
+ } else {
+ WARN_ONCE(*ef != &process_info->eviction_fence->base,
+ "KFD eviction fence doesn't match KGD process_info");
}
- dma_fence_put(&process_info->eviction_fence->base);
- process_info->eviction_fence = new_fence;
- *ef = dma_fence_get(&new_fence->base);
- /* Attach new eviction fence to all BOs */
- list_for_each_entry(mem, &process_info->kfd_bo_list,
- validate_list.head)
- amdgpu_bo_fence(mem->bo,
- &process_info->eviction_fence->base, true);
+ /* Attach new eviction fence to all BOs except pinned ones */
+ list_for_each_entry(mem, &process_info->kfd_bo_list, validate_list) {
+ if (mem->bo->tbo.pin_count)
+ continue;
- /* Attach eviction fence to PD / PT BOs */
+ dma_resv_add_fence(mem->bo->tbo.base.resv,
+ &process_info->eviction_fence->base,
+ DMA_RESV_USAGE_BOOKKEEP);
+ }
+ /* Attach eviction fence to PD / PT BOs and DMABuf imports */
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
- struct amdgpu_bo *bo = peer_vm->root.base.bo;
+ struct amdgpu_bo *bo = peer_vm->root.bo;
- amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
+ dma_resv_add_fence(bo->tbo.base.resv,
+ &process_info->eviction_fence->base,
+ DMA_RESV_USAGE_BOOKKEEP);
}
validate_map_fail:
- ttm_eu_backoff_reservation(&ctx.ticket, &ctx.list);
amdgpu_sync_free(&sync_obj);
ttm_reserve_fail:
+ drm_exec_fini(&exec);
mutex_unlock(&process_info->lock);
- kfree(pd_bo_list);
return ret;
}
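For context, this restore path is normally driven from a delayed worker that retries on transient failure and keeps its eviction-fence pointer up to date through the in/out ef argument. A hedged sketch of such a caller follows; struct example_process, its field names and the retry interval are placeholders, and the fence-pointer handling (including any RCU annotation) is simplified. The real scheduling lives in the KFD process code.

    #include <linux/workqueue.h>
    #include <linux/dma-fence.h>
    #include <linux/jiffies.h>

    /* Hypothetical per-process bookkeeping (heavily simplified) */
    struct example_process {
            void *kgd_process_info;         /* opaque process_info handle */
            struct dma_fence *ef;           /* current eviction fence */
            struct delayed_work restore_work;
    };

    static void example_restore_worker(struct work_struct *work)
    {
            struct example_process *p = container_of(to_delayed_work(work),
                                                     struct example_process,
                                                     restore_work);

            if (amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
                                                        &p->ef))
                    /* Restore can fail transiently; try again a bit later */
                    schedule_delayed_work(&p->restore_work,
                                          msecs_to_jiffies(100));
            /* On success, queues can be resumed against the fence in p->ef */
    }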
+
+int amdgpu_amdkfd_add_gws_to_process(void *info, void *gws, struct kgd_mem **mem)
+{
+ struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
+ struct amdgpu_bo *gws_bo = (struct amdgpu_bo *)gws;
+ int ret;
+
+ if (!info || !gws)
+ return -EINVAL;
+
+ *mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);
+ if (!*mem)
+ return -ENOMEM;
+
+ mutex_init(&(*mem)->lock);
+ INIT_LIST_HEAD(&(*mem)->attachments);
+ (*mem)->bo = amdgpu_bo_ref(gws_bo);
+ (*mem)->domain = AMDGPU_GEM_DOMAIN_GWS;
+ (*mem)->process_info = process_info;
+ add_kgd_mem_to_kfd_bo_list(*mem, process_info, false);
+ amdgpu_sync_create(&(*mem)->sync);
+
+ /* Validate gws bo the first time it is added to process */
+ mutex_lock(&(*mem)->process_info->lock);
+ ret = amdgpu_bo_reserve(gws_bo, false);
+ if (unlikely(ret)) {
+ pr_err("Reserve gws bo failed %d\n", ret);
+ goto bo_reservation_failure;
+ }
+
+ ret = amdgpu_amdkfd_bo_validate(gws_bo, AMDGPU_GEM_DOMAIN_GWS, true);
+ if (ret) {
+ pr_err("GWS BO validate failed %d\n", ret);
+ goto bo_validation_failure;
+ }
+ /* The GWS resource is shared between amdgpu and amdkfd.
+ * Add the process eviction fence to the BO so they can
+ * evict each other.
+ */
+ ret = dma_resv_reserve_fences(gws_bo->tbo.base.resv, 1);
+ if (ret)
+ goto reserve_shared_fail;
+ dma_resv_add_fence(gws_bo->tbo.base.resv,
+ &process_info->eviction_fence->base,
+ DMA_RESV_USAGE_BOOKKEEP);
+ amdgpu_bo_unreserve(gws_bo);
+ mutex_unlock(&(*mem)->process_info->lock);
+
+ return ret;
+
+reserve_shared_fail:
+bo_validation_failure:
+ amdgpu_bo_unreserve(gws_bo);
+bo_reservation_failure:
+ mutex_unlock(&(*mem)->process_info->lock);
+ amdgpu_sync_free(&(*mem)->sync);
+ remove_kgd_mem_from_kfd_bo_list(*mem, process_info);
+ amdgpu_bo_unref(&gws_bo);
+ mutex_destroy(&(*mem)->lock);
+ kfree(*mem);
+ *mem = NULL;
+ return ret;
+}
+
+int amdgpu_amdkfd_remove_gws_from_process(void *info, void *mem)
+{
+ int ret;
+ struct amdkfd_process_info *process_info = (struct amdkfd_process_info *)info;
+ struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
+ struct amdgpu_bo *gws_bo = kgd_mem->bo;
+
+ /* Remove BO from process's validate list so restore worker won't touch
+ * it anymore
+ */
+ remove_kgd_mem_from_kfd_bo_list(kgd_mem, process_info);
+
+ ret = amdgpu_bo_reserve(gws_bo, false);
+ if (unlikely(ret)) {
+ pr_err("Reserve gws bo failed %d\n", ret);
+ /* TODO: add the BO back to validate_list? */
+ return ret;
+ }
+ amdgpu_amdkfd_remove_eviction_fence(gws_bo,
+ process_info->eviction_fence);
+ amdgpu_bo_unreserve(gws_bo);
+ amdgpu_sync_free(&kgd_mem->sync);
+ amdgpu_bo_unref(&gws_bo);
+ mutex_destroy(&kgd_mem->lock);
+ kfree(mem);
+ return 0;
+}
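A hedged sketch of how the two GWS helpers pair up from a caller's point of view (function names are hypothetical and error handling is trimmed; the real callers live in the KFD process/queue code):

    /* Sketch only: bind the device GWS BO into a process and release it again */
    static int example_bind_gws(void *process_info, struct amdgpu_bo *gws_bo,
                                struct kgd_mem **gws_mem)
    {
            /* Validates the shared GWS BO and attaches the process eviction fence */
            return amdgpu_amdkfd_add_gws_to_process(process_info, gws_bo, gws_mem);
    }

    static int example_unbind_gws(void *process_info, struct kgd_mem *gws_mem)
    {
            /* Drops the eviction fence and the BO reference taken on add */
            return amdgpu_amdkfd_remove_gws_from_process(process_info, gws_mem);
    }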
+
+/* Returns GPU-specific tiling mode information */
+int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
+ struct tile_config *config)
+{
+ config->gb_addr_config = adev->gfx.config.gb_addr_config;
+ config->tile_config_ptr = adev->gfx.config.tile_mode_array;
+ config->num_tile_configs =
+ ARRAY_SIZE(adev->gfx.config.tile_mode_array);
+ config->macro_tile_config_ptr =
+ adev->gfx.config.macrotile_mode_array;
+ config->num_macro_tile_configs =
+ ARRAY_SIZE(adev->gfx.config.macrotile_mode_array);
+
+ /* Those values are not set from GFX9 onwards */
+ config->num_banks = adev->gfx.config.num_banks;
+ config->num_ranks = adev->gfx.config.num_ranks;
+
+ return 0;
+}
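The query only copies pointers and sizes out of adev->gfx.config, so a caller can log or forward them cheaply. A minimal sketch (function name hypothetical):

    /* Sketch only: fetch and log the tiling configuration of one device */
    static void example_log_tile_config(struct amdgpu_device *adev)
    {
            struct tile_config cfg = {};

            amdgpu_amdkfd_get_tile_config(adev, &cfg);
            /* num_banks/num_ranks are only meaningful before GFX9 */
            dev_info(adev->dev, "gb_addr_config 0x%x, %u tile modes\n",
                     cfg.gb_addr_config, cfg.num_tile_configs);
    }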
+
+bool amdgpu_amdkfd_bo_mapped_to_dev(void *drm_priv, struct kgd_mem *mem)
+{
+ struct amdgpu_vm *vm = drm_priv_to_vm(drm_priv);
+ struct kfd_mem_attachment *entry;
+
+ list_for_each_entry(entry, &mem->attachments, list) {
+ if (entry->is_mapped && entry->bo_va->base.vm == vm)
+ return true;
+ }
+ return false;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+
+int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data)
+{
+ spin_lock(&kfd_mem_limit.mem_limit_lock);
+ seq_printf(m, "System mem used %lldM out of %lluM\n",
+ (kfd_mem_limit.system_mem_used >> 20),
+ (kfd_mem_limit.max_system_mem_limit >> 20));
+ seq_printf(m, "TTM mem used %lldM out of %lluM\n",
+ (kfd_mem_limit.ttm_mem_used >> 20),
+ (kfd_mem_limit.max_ttm_mem_limit >> 20));
+ spin_unlock(&kfd_mem_limit.mem_limit_lock);
+
+ return 0;
+}
+
+#endif
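This show callback can be wired up with the stock debugfs helpers; a hedged sketch follows (the file name, parent dentry and wrapper are hypothetical; the real registration is done in the KFD debugfs code):

    #include <linux/debugfs.h>
    #include <linux/seq_file.h>

    static int example_mem_limits_show(struct seq_file *m, void *data)
    {
            return kfd_debugfs_kfd_mem_limits(m, data);
    }
    DEFINE_SHOW_ATTRIBUTE(example_mem_limits);

    static void example_register_mem_limits(struct dentry *kfd_root)
    {
            /* Read-only file under a (hypothetical) KFD debugfs directory */
            debugfs_create_file("mem_limits", 0444, kfd_root, NULL,
                                &example_mem_limits_fops);
    }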