summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
diff options
context:
space:
mode:
authorPhilip Yang <Philip.Yang@amd.com>2023-09-13 10:10:23 -0400
committerAlex Deucher <alexander.deucher@amd.com>2023-09-26 17:00:23 -0400
commiteb3c357bcb286e89386e89302061fe717fe4e562 (patch)
treedb5ec94fe99f97df6e9fc8b1263eb2d0eb284d26 /drivers/gpu/drm/amd/amdkfd/kfd_svm.c
parent21e43386aec839faf00b71b4684cc72eb649a0f5 (diff)
drm/amdkfd: Handle errors from svm validate and map
If new range is splited to multiple pranges with max_svm_range_pages alignment and added to update_list, svm validate and map should keep going after error to make sure prange->mapped_to_gpu flag is up to date for the whole range. svm validate and map update set prange->mapped_to_gpu after mapping to GPUs successfully, otherwise clear prange->mapped_to_gpu flag (for update mapping case) instead of setting error flag, we can remove the redundant error flag to simpliy code. Refactor to remove goto and update prange->mapped_to_gpu flag inside svm_range_lock, to guarant we always evict queues or unmap from GPUs if there are invalid ranges. After svm validate and map return error -EAGIN, the caller retry will update the mapping for the whole range again. Fixes: c22b04407097 ("drm/amdkfd: flag added to handle errors from svm validate and map") Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Tested-by: James Zhu <james.zhu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_svm.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c82
1 files changed, 39 insertions, 43 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 56a2afdfd3d4..58cca80589ae 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -829,7 +829,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
}
}
- return !prange->is_error_flag;
+ return true;
}
/**
@@ -1679,71 +1679,66 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
start = prange->start << PAGE_SHIFT;
end = (prange->last + 1) << PAGE_SHIFT;
- for (addr = start; addr < end && !r; ) {
+ for (addr = start; !r && addr < end; ) {
struct hmm_range *hmm_range;
struct vm_area_struct *vma;
- unsigned long next;
+ unsigned long next = 0;
unsigned long offset;
unsigned long npages;
bool readonly;
vma = vma_lookup(mm, addr);
- if (!vma) {
+ if (vma) {
+ readonly = !(vma->vm_flags & VM_WRITE);
+
+ next = min(vma->vm_end, end);
+ npages = (next - addr) >> PAGE_SHIFT;
+ WRITE_ONCE(p->svms.faulting_task, current);
+ r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
+ readonly, owner, NULL,
+ &hmm_range);
+ WRITE_ONCE(p->svms.faulting_task, NULL);
+ if (r) {
+ pr_debug("failed %d to get svm range pages\n", r);
+ if (r == -EBUSY)
+ r = -EAGAIN;
+ }
+ } else {
r = -EFAULT;
- goto unreserve_out;
- }
- readonly = !(vma->vm_flags & VM_WRITE);
-
- next = min(vma->vm_end, end);
- npages = (next - addr) >> PAGE_SHIFT;
- WRITE_ONCE(p->svms.faulting_task, current);
- r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
- readonly, owner, NULL,
- &hmm_range);
- WRITE_ONCE(p->svms.faulting_task, NULL);
- if (r) {
- pr_debug("failed %d to get svm range pages\n", r);
- if (r == -EBUSY)
- r = -EAGAIN;
- goto unreserve_out;
}
- offset = (addr - start) >> PAGE_SHIFT;
- r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
- hmm_range->hmm_pfns);
- if (r) {
- pr_debug("failed %d to dma map range\n", r);
- goto unreserve_out;
+ if (!r) {
+ offset = (addr - start) >> PAGE_SHIFT;
+ r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
+ hmm_range->hmm_pfns);
+ if (r)
+ pr_debug("failed %d to dma map range\n", r);
}
svm_range_lock(prange);
- if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
+ if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;
- goto unlock_out;
}
- if (!list_empty(&prange->child_list)) {
+
+ if (!r && !list_empty(&prange->child_list)) {
pr_debug("range split by unmap in parallel, validate again\n");
r = -EAGAIN;
- goto unlock_out;
}
- r = svm_range_map_to_gpus(prange, offset, npages, readonly,
- ctx->bitmap, wait, flush_tlb);
+ if (!r)
+ r = svm_range_map_to_gpus(prange, offset, npages, readonly,
+ ctx->bitmap, wait, flush_tlb);
+
+ if (!r && next == end)
+ prange->mapped_to_gpu = true;
-unlock_out:
svm_range_unlock(prange);
addr = next;
}
- if (addr == end)
- prange->mapped_to_gpu = true;
-
-unreserve_out:
svm_range_unreserve_bos(ctx);
-
- prange->is_error_flag = !!r;
if (!r)
prange->validate_timestamp = ktime_get_boottime();
@@ -2112,7 +2107,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
next = interval_tree_iter_next(node, start, last);
next_start = min(node->last, last) + 1;
- if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
+ if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
+ prange->mapped_to_gpu) {
/* nothing to do */
} else if (node->start < start || node->last > last) {
/* node intersects the update range and its attributes
@@ -3525,7 +3521,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
struct svm_range *next;
bool update_mapping = false;
bool flush_tlb;
- int r = 0;
+ int r, ret = 0;
pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
p->pasid, &p->svms, start, start + size - 1, size);
@@ -3613,7 +3609,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
out_unlock_range:
mutex_unlock(&prange->migrate_mutex);
if (r)
- break;
+ ret = r;
}
dynamic_svm_range_dump(svms);
@@ -3626,7 +3622,7 @@ out:
pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
&p->svms, start, start + size - 1, r);
- return r;
+ return ret ? ret : r;
}
static int