diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_events.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 91 |
1 files changed, 73 insertions, 18 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 0f58be65132f..5a190dd6be4e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -31,6 +31,7 @@ #include <linux/memory.h> #include "kfd_priv.h" #include "kfd_events.h" +#include "kfd_device_queue_manager.h" #include <linux/device.h> /* @@ -726,7 +727,7 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id, * to process context, kfd_process could attempt to exit while we are * running so the lookup function increments the process ref count. */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; /* Presumably process exited. */ @@ -880,6 +881,10 @@ static int copy_signaled_event_data(uint32_t num_events, dst = &data[i].memory_exception_data; src = &event->memory_exception_data; size = sizeof(struct kfd_hsa_memory_exception_data); + } else if (event->type == KFD_EVENT_TYPE_HW_EXCEPTION) { + dst = &data[i].memory_exception_data; + src = &event->hw_exception_data; + size = sizeof(struct kfd_hsa_hw_exception_data); } else if (event->type == KFD_EVENT_TYPE_SIGNAL && waiter->event_age_enabled) { dst = &data[i].signal_event_data.last_event_age; @@ -1123,8 +1128,8 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (type == KFD_EVENT_TYPE_MEMORY) { dev_warn(kfd_device, - "Sending SIGSEGV to process %d (pasid 0x%x)", - p->lead_thread->pid, p->pasid); + "Sending SIGSEGV to process pid %d", + p->lead_thread->pid); send_sig(SIGSEGV, p->lead_thread, 0); } @@ -1132,13 +1137,13 @@ static void lookup_events_by_type_and_signal(struct kfd_process *p, if (send_signal) { if (send_sigterm) { dev_warn(kfd_device, - "Sending SIGTERM to process %d (pasid 0x%x)", - p->lead_thread->pid, p->pasid); + "Sending SIGTERM to process pid %d", + p->lead_thread->pid); send_sig(SIGTERM, p->lead_thread, 0); } else { dev_err(kfd_device, - "Process %d (pasid 0x%x) got unhandled exception", - p->lead_thread->pid, p->pasid); + "Process pid %d got unhandled exception", + p->lead_thread->pid); } } @@ -1152,7 +1157,7 @@ void kfd_signal_hw_exception_event(u32 pasid) * to process context, kfd_process could attempt to exit while we are * running so the lookup function increments the process ref count. */ - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; /* Presumably process exited. */ @@ -1161,22 +1166,39 @@ void kfd_signal_hw_exception_event(u32 pasid) kfd_unref_process(p); } -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, +void kfd_signal_vm_fault_event_with_userptr(struct kfd_process *p, uint64_t gpu_va) +{ + struct kfd_process_device *pdd; + struct kfd_hsa_memory_exception_data exception_data; + int i; + + memset(&exception_data, 0, sizeof(exception_data)); + exception_data.va = gpu_va; + exception_data.failure.NotPresent = 1; + + // Send VM seg fault to all kfd process device + for (i = 0; i < p->n_pdds; i++) { + pdd = p->pdds[i]; + exception_data.gpu_id = pdd->user_gpu_id; + kfd_evict_process_device(pdd); + kfd_signal_vm_fault_event(pdd, NULL, &exception_data); + } +} + +void kfd_signal_vm_fault_event(struct kfd_process_device *pdd, struct kfd_vm_fault_info *info, struct kfd_hsa_memory_exception_data *data) { struct kfd_event *ev; uint32_t id; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = pdd->process; struct kfd_hsa_memory_exception_data memory_exception_data; int user_gpu_id; - if (!p) - return; /* Presumably process exited. */ - - user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); + user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id); if (unlikely(user_gpu_id == -EINVAL)) { - WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); + WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", + pdd->dev->id); return; } @@ -1213,7 +1235,6 @@ void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid, } rcu_read_unlock(); - kfd_unref_process(p); } void kfd_signal_reset_event(struct kfd_node *dev) @@ -1240,12 +1261,41 @@ void kfd_signal_reset_event(struct kfd_node *dev) idx = srcu_read_lock(&kfd_processes_srcu); hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); + struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p); if (unlikely(user_gpu_id == -EINVAL)) { WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); continue; } + if (unlikely(!pdd)) { + WARN_ONCE(1, "Could not get device data from process pid:%d\n", + p->lead_thread->pid); + continue; + } + + if (dev->dqm->detect_hang_count && !pdd->has_reset_queue) + continue; + + if (dev->dqm->detect_hang_count) { + struct amdgpu_task_info *ti; + struct amdgpu_fpriv *drv_priv; + + if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, &drv_priv))) { + WARN_ONCE(1, "Could not get vm for device %x from pid:%d\n", + dev->id, p->lead_thread->pid); + continue; + } + + ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm); + if (ti) { + dev_err(dev->adev->dev, + "Queues reset on process %s tid %d thread %s pid %d\n", + ti->process_name, ti->tgid, ti->task.comm, ti->task.pid); + amdgpu_vm_put_task_info(ti); + } + } + rcu_read_lock(); id = KFD_FIRST_NONSIGNAL_EVENT_ID; @@ -1274,19 +1324,22 @@ void kfd_signal_reset_event(struct kfd_node *dev) void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) { - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_hw_exception_data hw_exception_data; struct kfd_event *ev; uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; int user_gpu_id; - if (!p) + if (!p) { + dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", pasid); return; /* Presumably process exited. */ + } user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); if (unlikely(user_gpu_id == -EINVAL)) { WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id); + kfd_unref_process(p); return; } @@ -1318,6 +1371,8 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) } } + dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n", + p->lead_thread->comm, pasid); rcu_read_unlock(); /* user application will handle SIGBUS signal */ |
