diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 146 |
1 files changed, 90 insertions, 56 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index 0d53f6067422..2788a52714d1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -26,6 +26,7 @@ #include "kfd_device_queue_manager.h" #include "ivsrcid/vmc/irqsrcs_vmc_1_0.h" #include "kfd_smi_events.h" +#include "kfd_debug.h" /* * GFX11 SQ Interrupts @@ -147,51 +148,78 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \ KFD_CTXID0_DOORBELL_ID_MASK) -static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_debug( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_CMD_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_BUF_FULL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + REG_TIMESTAMP), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + CMD_TIMESTAMP), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + HOST_CMD_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + HOST_REG_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + IMMED_OVERFLOW), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, + THREAD_TRACE_UTC_ERROR)); } -static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_debug( + dev_dbg_ratelimited( + dev->adev->dev, "sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, + SH_ID), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID), - REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, + WAVE_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, + SIMD_ID), + REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, + WGP_ID)); } -static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1) +static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0, + uint32_t context_id1) { - pr_warn( + dev_warn_ratelimited( + dev->adev->dev, "sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n", - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID), - REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID)); + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + DETAIL), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + TYPE), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + SH_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + PRIV), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, + WAVE_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, + SIMD_ID), + REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, + WGP_ID)); } -static void event_interrupt_poison_consumption_v11(struct kfd_dev *dev, +static void event_interrupt_poison_consumption_v11(struct kfd_node *dev, uint16_t pasid, uint16_t source_id) { + enum amdgpu_ras_block block = 0; int ret = -EINVAL; - struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + uint32_t reset = 0; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL); if (!p) return; @@ -209,9 +237,14 @@ static void event_interrupt_poison_consumption_v11(struct kfd_dev *dev, case SOC15_INTSRC_SQ_INTERRUPT_MSG: if (dev->dqm->ops.reset_queues) ret = dev->dqm->ops.reset_queues(dev->dqm, pasid); + block = AMDGPU_RAS_BLOCK__GFX; + if (ret) + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; case SOC21_INTSRC_SDMA_ECC: default: + block = AMDGPU_RAS_BLOCK__GFX; + reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; break; } @@ -219,13 +252,10 @@ static void event_interrupt_poison_consumption_v11(struct kfd_dev *dev, /* resetting queue passes, do page retirement without gpu reset resetting queue fails, fallback to gpu reset solution */ - if (!ret) - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); - else - amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); } -static bool event_interrupt_isr_v11(struct kfd_dev *dev, +static bool event_interrupt_isr_v11(struct kfd_node *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, bool *patched_flag) @@ -238,7 +268,7 @@ static bool event_interrupt_isr_v11(struct kfd_dev *dev, client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); /* Only handle interrupts from KFD VMIDs */ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - if (/*!KFD_IRQ_IS_FENCE(client_id, source_id) &&*/ + if (!KFD_IRQ_IS_FENCE(client_id, source_id) && (vmid < dev->vm_info.first_vmid_kfd || vmid > dev->vm_info.last_vmid_kfd)) return false; @@ -250,14 +280,14 @@ static bool event_interrupt_isr_v11(struct kfd_dev *dev, (context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG)) return false; - pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", - client_id, source_id, vmid, pasid); - pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", - data[0], data[1], data[2], data[3], - data[4], data[5], data[6], data[7]); + dev_dbg(dev->adev->dev, + "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n", + client_id, source_id, vmid, pasid); + dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n", + data[0], data[1], data[2], data[3], data[4], data[5], data[6], + data[7]); - /* If there is no valid PASID, it's likely a bug */ - if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) + if (pasid == 0) return false; /* Interrupt types we care about: various signals and faults. @@ -267,19 +297,19 @@ static bool event_interrupt_isr_v11(struct kfd_dev *dev, source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || source_id == SOC15_INTSRC_CP_BAD_OPCODE || source_id == SOC21_INTSRC_SDMA_TRAP || - /* KFD_IRQ_IS_FENCE(client_id, source_id) || */ + KFD_IRQ_IS_FENCE(client_id, source_id) || (((client_id == SOC21_IH_CLIENTID_VMC) || ((client_id == SOC21_IH_CLIENTID_GFX) && (source_id == UTCL2_1_0__SRCID__FAULT))) && !amdgpu_no_queue_eviction_on_vm_fault); } -static void event_interrupt_wq_v11(struct kfd_dev *dev, +static void event_interrupt_wq_v11(struct kfd_node *dev, const uint32_t *ih_ring_entry) { uint16_t source_id, client_id, ring_id, pasid, vmid; uint32_t context_id0, context_id1; - uint8_t sq_int_enc, sq_int_errtype, sq_int_priv; + uint8_t sq_int_enc, sq_int_priv, sq_int_errtype; struct kfd_vm_fault_info info = {0}; struct kfd_hsa_memory_exception_data exception_data; @@ -312,9 +342,9 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev, exception_data.failure.ReadOnly = info.prot_write ? 1 : 0; exception_data.failure.imprecise = 0; - /*kfd_set_dbg_ev_from_interrupt(dev, pasid, -1, + kfd_set_dbg_ev_from_interrupt(dev, pasid, -1, KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION), - &exception_data, sizeof(exception_data));*/ + &exception_data, sizeof(exception_data)); kfd_smi_event_update_vmfault(dev, pasid); /* GRBM, SDMA, SE, PMM */ @@ -324,11 +354,15 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev, /* CP */ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) kfd_signal_event_interrupt(pasid, context_id0, 32); - /*else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) - kfd_set_dbg_ev_from_interrupt(dev, pasid, - KFD_CTXID0_DOORBELL_ID(context_id0), + else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { + u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); + + kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id, KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), - NULL, 0);*/ + NULL, 0); + kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id); + } /* SDMA */ else if (source_id == SOC21_INTSRC_SDMA_TRAP) @@ -344,20 +378,20 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev, SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING); switch (sq_int_enc) { case SQ_INTERRUPT_WORD_ENCODING_AUTO: - print_sq_intr_info_auto(context_id0, context_id1); + print_sq_intr_info_auto(dev, context_id0, context_id1); break; case SQ_INTERRUPT_WORD_ENCODING_INST: - print_sq_intr_info_inst(context_id0, context_id1); + print_sq_intr_info_inst(dev, context_id0, context_id1); sq_int_priv = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV); - /*if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid, + if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_CTXID0_DOORBELL_ID(context_id0), KFD_CTXID0_TRAP_CODE(context_id0), NULL, 0))) - return;*/ + return; break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: - print_sq_intr_info_error(context_id0, context_id1); + print_sq_intr_info_error(dev, context_id0, context_id1); sq_int_errtype = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE); if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && @@ -373,8 +407,8 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev, kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24); } - /*} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) { - kfd_process_close_interrupt_drain(pasid);*/ + } else if (KFD_IRQ_IS_FENCE(client_id, source_id)) { + kfd_process_close_interrupt_drain(pasid); } } |
