From 50661eb1a2c88c0e50cb234cb117a7fbfe03b3e0 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 3 Jan 2024 18:13:22 -0500 Subject: drm/amdgpu: Auto-validate DMABuf imports in compute VMs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the process_info->kfd_bo_list. There is no explicit KFD API call to validate them or add eviction fences to them. This patch automatically validates and fences dymanic DMABuf imports when they are added to a compute VM. Revalidation after evictions is handled in the VM code. v2: * Renamed amdgpu_vm_validate_evicted_bos to amdgpu_vm_validate * Eliminated evicted_user state, use evicted state for VM BOs and user BOs * Fixed and simplified amdgpu_vm_fence_imports, depends on reserved BOs * Moved dma_resv_reserve_fences for amdgpu_vm_fence_imports into amdgpu_vm_validate, outside the vm->status_lock * Added dummy version of amdgpu_amdkfd_bo_validate_and_fence for builds without KFD v4: Eliminate amdgpu_vm_fence_imports. It's not needed because the reservation with its fences is shared with the export, as long as all imports are from KFD, with the exports already reserved, validated and fenced by the KFD restore worker. v5: Reintroduced separate evicted_user state to simplify the state machine and CS error handling when amdgpu_vm_validate is called without a ticket. Signed-off-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 4740dd65b99d..67d12f86bfe4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -288,9 +288,12 @@ struct amdgpu_vm { /* Lock to protect vm_bo add/del/move on all lists of vm */ spinlock_t status_lock; - /* BOs who needs a validation */ + /* Per-VM and PT BOs who needs a validation */ struct list_head evicted; + /* BOs for user mode queues that need a validation */ + struct list_head evicted_user; + /* PT BOs which relocated and their parent need an update */ struct list_head relocated; @@ -434,9 +437,10 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, unsigned int num_fences); bool amdgpu_vm_ready(struct amdgpu_vm *vm); uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm); -int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int (*callback)(void *p, struct amdgpu_bo *bo), - void *param); +int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, + struct ww_acquire_ctx *ticket, + int (*callback)(void *p, struct amdgpu_bo *bo), + void *param); int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_pipe_sync); int amdgpu_vm_update_pdes(struct amdgpu_device *adev, struct amdgpu_vm *vm, bool immediate); -- cgit From 00a11f977beb752186221679db2265a69118a5a7 Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Thu, 11 Jan 2024 23:21:13 -0800 Subject: drm/amdgpu: Enable seq64 manager and fix bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Enable the seq64 mapping sequence. - Fix wflinfo va conflict and other bugs. v1: - The seq64 area needs to be included in the AMDGPU_VA_RESERVED_SIZE otherwise the areas will conflict with user space allocations (Alex) - It needs to be mapped read only in the user VM (Alex) v2: - Instead of just one define for TOP/BOTTOM reserved space separate them into two (Christian) - Fix the CPU and VA calculations and while at it also cleanup error handling and kerneldoc (Christian) Signed-off-by: Christian König Signed-off-by: Alex Deucher Signed-off-by: Arunpravin Paneer Selvam Reviewed-by: Christian König --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 67d12f86bfe4..666698a57192 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -136,7 +136,11 @@ struct amdgpu_mem_stats; #define AMDGPU_IS_MMHUB1(x) ((x) >= AMDGPU_MMHUB1_START && (x) < AMDGPU_MAX_VMHUBS) /* Reserve 2MB at top/bottom of address space for kernel use */ -#define AMDGPU_VA_RESERVED_SIZE (2ULL << 20) +#define AMDGPU_VA_RESERVED_CSA_SIZE (2ULL << 20) +#define AMDGPU_VA_RESERVED_SEQ64_SIZE (2ULL << 20) +#define AMDGPU_VA_RESERVED_BOTTOM (2ULL << 20) +#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_SEQ64_SIZE + \ + AMDGPU_VA_RESERVED_CSA_SIZE) /* See vm_update_mode */ #define AMDGPU_VM_USE_CPU_FOR_GFX (1 << 0) -- cgit From efe0f34c2bd037a0b01465323d52a7bbd8b5e888 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 30 Jan 2024 14:34:41 -0500 Subject: drm/amdgpu: Reduce VA_RESERVED_BOTTOM to 64KB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reservation is there to catch NULL pointer dereferences from the GPU. Reduce the size to 64KB to make sure that shared virtual address programming models can map all CPU-accessible virtual addresses for GPU access. This is also the default for CPU virtual address mappings as seen in /proc/sys/vm/mmap_min_addr. Reviewed-by: Christian König Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 666698a57192..2c4053b29bb3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -135,10 +135,10 @@ struct amdgpu_mem_stats; #define AMDGPU_IS_MMHUB0(x) ((x) >= AMDGPU_MMHUB0_START && (x) < AMDGPU_MMHUB1_START) #define AMDGPU_IS_MMHUB1(x) ((x) >= AMDGPU_MMHUB1_START && (x) < AMDGPU_MAX_VMHUBS) -/* Reserve 2MB at top/bottom of address space for kernel use */ +/* Reserve space at top/bottom of address space for kernel use */ #define AMDGPU_VA_RESERVED_CSA_SIZE (2ULL << 20) #define AMDGPU_VA_RESERVED_SEQ64_SIZE (2ULL << 20) -#define AMDGPU_VA_RESERVED_BOTTOM (2ULL << 20) +#define AMDGPU_VA_RESERVED_BOTTOM (1ULL << 16) #define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_SEQ64_SIZE + \ AMDGPU_VA_RESERVED_CSA_SIZE) -- cgit From 34a1de0f79352086884553f78db271f957a98583 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Thu, 25 Jan 2024 14:12:43 -0500 Subject: drm/amdkfd: Relocate TBA/TMA to opposite side of VM hole MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TBA and TMA, along with an unused IB allocation, reside at low addresses in the VM address space. A stray VM fault which hits these pages must be serviced by making their page table entries invalid. The scheduler depends upon these pages being resident and fails, preventing a debugger from inspecting the failure state. By relocating these pages above 47 bits in the VM address space they can only be reached when bits [63:48] are set to 1. This makes it much less likely for a misbehaving program to generate accesses to them. The current placement at VA (PAGE_SIZE*2) is readily hit by a NULL access with a small offset. v2: - Move it to the reserved space to avoid concflicts with Mesa - Add macros to make reserved space management easier v3: - Move VM max PFN calculation into AMDGPU_VA_RESERVED macros Cc: Arunpravin Paneer Selvam Cc: Christian Koenig Reviewed-by: Christian König Signed-off-by: Jay Cornwall Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 2c4053b29bb3..42f6ddec50c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -137,9 +137,18 @@ struct amdgpu_mem_stats; /* Reserve space at top/bottom of address space for kernel use */ #define AMDGPU_VA_RESERVED_CSA_SIZE (2ULL << 20) +#define AMDGPU_VA_RESERVED_CSA_START(adev) (((adev)->vm_manager.max_pfn \ + << AMDGPU_GPU_PAGE_SHIFT) \ + - AMDGPU_VA_RESERVED_CSA_SIZE) #define AMDGPU_VA_RESERVED_SEQ64_SIZE (2ULL << 20) +#define AMDGPU_VA_RESERVED_SEQ64_START(adev) (AMDGPU_VA_RESERVED_CSA_START(adev) \ + - AMDGPU_VA_RESERVED_SEQ64_SIZE) +#define AMDGPU_VA_RESERVED_TRAP_SIZE (2ULL << 12) +#define AMDGPU_VA_RESERVED_TRAP_START(adev) (AMDGPU_VA_RESERVED_SEQ64_START(adev) \ + - AMDGPU_VA_RESERVED_TRAP_SIZE) #define AMDGPU_VA_RESERVED_BOTTOM (1ULL << 16) -#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_SEQ64_SIZE + \ +#define AMDGPU_VA_RESERVED_TOP (AMDGPU_VA_RESERVED_TRAP_SIZE + \ + AMDGPU_VA_RESERVED_SEQ64_SIZE + \ AMDGPU_VA_RESERVED_CSA_SIZE) /* See vm_update_mode */ -- cgit From feb13f52c8547a8198045077d6aa9c3f2400ba11 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Thu, 29 Feb 2024 14:00:14 +0800 Subject: Revert "drm/amdgpu: remove vm sanity check from amdgpu_vm_make_compute" for Raven MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix the issue: "amdgpu: Failed to create process VM object". [Why]when amdgpu initialized, seq64 do mampping and update bo mapping in vm page table. But when clifo run. It also initializes a vm for a process device through the function kfd_process_device_init_vm and ensure the root PD is clean through the function amdgpu_vm_pt_is_root_clean. So they have a conflict, and clinfo always failed. v1: - remove all the pte_supports_ats stuff from the amdgpu_vm code (Felix) Signed-off-by: Jesse Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 42f6ddec50c1..9f6b5e1ccf34 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -357,9 +357,6 @@ struct amdgpu_vm { /* Functions to use for VM table updates */ const struct amdgpu_vm_update_funcs *update_funcs; - /* Flag to indicate ATS support from PTE for GFX9 */ - bool pte_support_ats; - /* Up to 128 pending retry page faults */ DECLARE_KFIFO(faults, u64, 128); -- cgit From b8f67b9ddf4f8fe6dd536590712b5912ad78f99c Mon Sep 17 00:00:00 2001 From: Shashank Sharma Date: Thu, 18 Jan 2024 20:15:42 +0100 Subject: drm/amdgpu: change vm->task_info handling This patch changes the handling and lifecycle of vm->task_info object. The major changes are: - vm->task_info is a dynamically allocated ptr now, and its uasge is reference counted. - introducing two new helper funcs for task_info lifecycle management - amdgpu_vm_get_task_info: reference counts up task_info before returning this info - amdgpu_vm_put_task_info: reference counts down task_info - last put to task_info() frees task_info from the vm. This patch also does logistical changes required for existing usage of vm->task_info. V2: Do not block all the prints when task_info not found (Felix) V3: Fixed review comments from Felix - Fix wrong indentation - No debug message for -ENOMEM - Add NULL check for task_info - Do not duplicate the debug messages (ti vs no ti) - Get first reference of task_info in vm_init(), put last in vm_fini() V4: Fixed review comments from Felix - fix double reference increment in create_task_info - change amdgpu_vm_get_task_info_pasid - additional changes in amdgpu_gem.c while porting Cc: Christian Koenig Cc: Alex Deucher Cc: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: Shashank Sharma Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 9f6b5e1ccf34..7f95039bb37d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -203,10 +203,11 @@ struct amdgpu_vm_pte_funcs { }; struct amdgpu_task_info { - char process_name[TASK_COMM_LEN]; - char task_name[TASK_COMM_LEN]; - pid_t pid; - pid_t tgid; + char process_name[TASK_COMM_LEN]; + char task_name[TASK_COMM_LEN]; + pid_t pid; + pid_t tgid; + struct kref refcount; }; /** @@ -370,7 +371,7 @@ struct amdgpu_vm { uint64_t pd_phys_addr; /* Some basic info about the task */ - struct amdgpu_task_info task_info; + struct amdgpu_task_info *task_info; /* Store positions of group of BOs */ struct ttm_lru_bulk_move lru_bulk_move; @@ -511,8 +512,14 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring, struct amdgpu_job *job); void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, - struct amdgpu_task_info *task_info); +struct amdgpu_task_info * +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid); + +struct amdgpu_task_info * +amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm); + +void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); + bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, bool write_fault); -- cgit From bb8863cc9d067c44e751579881048dca0403133c Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Tue, 5 Mar 2024 10:22:57 +0800 Subject: drm/amdgpu: remove unused code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unused function - amdgpu_vm_pt_is_root_clean and remove the impossible condition v1: entries == 0 is not possible any more, so this condition could probably be removed (Felix) Signed-off-by: Jesse Zhang Suggested-by:Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 7f95039bb37d..047ec1930d12 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -537,8 +537,6 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, int level, bool immediate, struct amdgpu_bo_vm **vmbo, int32_t xcp_id); void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm); -bool amdgpu_vm_pt_is_root_clean(struct amdgpu_device *adev, - struct amdgpu_vm *vm); int amdgpu_vm_pde_update(struct amdgpu_vm_update_params *params, struct amdgpu_vm_bo_base *entry); -- cgit