diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c | 231 |
1 files changed, 95 insertions, 136 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c index 8e0d0356e810..3e1ad8974797 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c @@ -129,63 +129,6 @@ enum SQ_INTERRUPT_ERROR_TYPE { KFD_DEBUG_CP_BAD_OP_ECODE_MASK) \ >> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT) -static void event_interrupt_poison_consumption(struct kfd_node *dev, - uint16_t pasid, uint16_t client_id) -{ - enum amdgpu_ras_block block = 0; - int old_poison, ret = -EINVAL; - uint32_t reset = 0; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); - - if (!p) - return; - - /* all queues of a process will be unmapped in one time */ - old_poison = atomic_cmpxchg(&p->poison, 0, 1); - kfd_unref_process(p); - if (old_poison) - return; - - switch (client_id) { - case SOC15_IH_CLIENTID_SE0SH: - case SOC15_IH_CLIENTID_SE1SH: - case SOC15_IH_CLIENTID_SE2SH: - case SOC15_IH_CLIENTID_SE3SH: - case SOC15_IH_CLIENTID_UTCL2: - ret = kfd_dqm_evict_pasid(dev->dqm, pasid); - block = AMDGPU_RAS_BLOCK__GFX; - if (ret) - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; - break; - case SOC15_IH_CLIENTID_SDMA0: - case SOC15_IH_CLIENTID_SDMA1: - case SOC15_IH_CLIENTID_SDMA2: - case SOC15_IH_CLIENTID_SDMA3: - case SOC15_IH_CLIENTID_SDMA4: - block = AMDGPU_RAS_BLOCK__SDMA; - reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; - break; - default: - break; - } - - kfd_signal_poison_consumed_event(dev, pasid); - - /* resetting queue passes, do page retirement without gpu reset - * resetting queue fails, fallback to gpu reset solution - */ - if (!ret) - dev_warn(dev->adev->dev, - "RAS poison consumption, unmap queue flow succeeded: client id %d\n", - client_id); - else - dev_warn(dev->adev->dev, - "RAS poison consumption, fall back to gpu reset flow: client id %d\n", - client_id); - - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); -} - static bool event_interrupt_isr_v10(struct kfd_node *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, @@ -225,14 +168,14 @@ static bool event_interrupt_isr_v10(struct kfd_node *dev, client_id != SOC15_IH_CLIENTID_SE3SH) return false; - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); - /* If there is no valid PASID, it's likely a bug */ - if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + if (pasid == 0) return 0; /* Interrupt types we care about: various signals and faults. @@ -274,37 +217,66 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); switch (encoding) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - pr_debug_ratelimited( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, se %d, ttrace %d, wlt %d, ttrac_buf0_full %d, ttrac_buf1_full %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_AUTO_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_BUF0_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_BUF1_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, - THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_AUTO_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + WLT), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF0_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF1_FULL), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_UTC_ERROR)); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - pr_debug_ratelimited("sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SA_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - WGP_ID)); + dev_dbg_ratelimited( + dev->adev->dev, + "sq_intr: inst, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SA_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SIMD_ID), + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID)); if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK) { if (kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_DEBUG_DOORBELL_ID(context_id0), @@ -316,27 +288,38 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, case SQ_INTERRUPT_WORD_ENCODING_ERROR: sq_intr_err_type = REG_GET_FIELD(context_id0, KFD_CTXID0, ERR_TYPE); - pr_warn_ratelimited("sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n", - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - SE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SA_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, - SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, - WGP_ID), + dev_warn_ratelimited( + dev->adev->dev, + "sq_intr: error, se %d, data 0x%x, sa %d, priv %d, wave_id %d, simd_id %d, wgp_id %d, err_type %d\n", + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + SE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + DATA), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SA_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + PRIV), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD( + context_id0, + SQ_INTERRUPT_WORD_WAVE_CTXID0, + SIMD_ID), + REG_GET_FIELD( + context_id1, + SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID), sq_intr_err_type); - if (sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && - sq_intr_err_type != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { - event_interrupt_poison_consumption(dev, pasid, source_id); - return; - } break; default: break; @@ -362,38 +345,14 @@ static void event_interrupt_wq_v10(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_SDMA7) { if (source_id == SOC15_INTSRC_SDMA_TRAP) { kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); - } else if (source_id == SOC15_INTSRC_SDMA_ECC) { - event_interrupt_poison_consumption(dev, pasid, source_id); - return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || client_id == SOC15_IH_CLIENTID_VMC1 || client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); - uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry); - uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry); - int hub_inst = 0; struct kfd_hsa_memory_exception_data exception_data; - /* gfxhub */ - if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) { - hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev, - node_id); - if (hub_inst < 0) - hub_inst = 0; - } - - /* mmhub */ - if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC) - hub_inst = node_id / 4; - - if (amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev, - hub_inst, vmid_type)) { - event_interrupt_poison_consumption(dev, pasid, client_id); - return; - } - info.vmid = vmid; info.mc_id = client_id; info.page_addr = ih_ring_entry[4] | |
