From 12fb1ad70d65edc3405884792d044fa79df7244f Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Fri, 22 Apr 2022 12:26:18 -0400 Subject: drm/amdkfd: update process interrupt handling for debug events The debugger must be notified by any debugger subscribed exception that comes from hardware interrupts. If a debugger session exits, any exceptions it subscribed to may still have interrupts in the interrupt ring buffer or KGD/KFD pipeline. To prevent a new session from inheriting stale interrupts, when a new queue is created, open an interrupt drain and allow the IH ring to drain from a timestamped checkpoint. Then inject a custom IV so that once the custom IV is picked up by the KFD, it's safe to close the drain and proceed with queue creation. The drain must also be on debug disable as SW interrupts may still be processed. Drain at this time and clear all the exception status. The debugger may also not be attached nor subscibed to certain exceptions so forward them directly to the runtime. GFX10 also requires its own IV processing, hence the creation of kfd_int_process_v10.c. This is because the IV from SQ interrupts are packed into a new continguous format unlike GFX9. To make this clear, a separate interrupting handling code file was created. Signed-off-by: Jonathan Kim Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 98 ++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 8cf58be80f4e..d5c9f30552e3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -23,10 +23,40 @@ #include "kfd_priv.h" #include "kfd_events.h" +#include "kfd_debug.h" #include "soc15_int.h" #include "kfd_device_queue_manager.h" #include "kfd_smi_events.h" +/* + * GFX9 SQ Interrupts + * + * There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit + * packet to the Interrupt Handler: + * Auto - Generated by the SQG (various cmd overflows, timestamps etc) + * Wave - Generated by S_SENDMSG through a shader program + * Error - HW generated errors (Illegal instructions, Memviols, EDC etc) + * + * The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus + * 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such: + * + * - context_id0[27:26] + * Encoding type (0 = Auto, 1 = Wave, 2 = Error) + * + * - context_id0[13] + * PRIV bit indicates that Wave S_SEND or error occurred within trap + * + * - {context_id1[7:0],context_id0[31:28],context_id0[11:0]} + * 24-bit data with the following layout per encoding type: + * Auto - only context_id0[8:0] is used, which reports various interrupts + * generated by SQG. The rest is 0. + * Wave - user data sent from m0 via S_SENDMSG + * Error - Error type (context_id1[7:4]), Error Details (rest of bits) + * + * The other context_id bits show coordinates (SE/SH/CU/SIMD/WAVE) for wave + * S_SENDMSG and Errors. These are 0 for Auto. + */ + enum SQ_INTERRUPT_WORD_ENCODING { SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0, SQ_INTERRUPT_WORD_ENCODING_INST, @@ -84,12 +114,32 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define SQ_INTERRUPT_WORD_WAVE_CTXID__SE_ID_MASK 0x03000000 #define SQ_INTERRUPT_WORD_WAVE_CTXID__ENCODING_MASK 0x0c000000 +/* GFX9 SQ interrupt 24-bit data from context_id<0,1> */ #define KFD_CONTEXT_ID_GET_SQ_INT_DATA(ctx0, ctx1) \ ((ctx0 & 0xfff) | ((ctx0 >> 16) & 0xf000) | ((ctx1 << 16) & 0xff0000)) #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20 +/* + * The debugger will send user data(m0) with PRIV=1 to indicate it requires + * notification from the KFD with the following queue id (DOORBELL_ID) and + * trap code (TRAP_CODE). + */ +#define KFD_INT_DATA_DEBUG_DOORBELL_MASK 0x0003ff +#define KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT 10 +#define KFD_INT_DATA_DEBUG_TRAP_CODE_MASK 0x07fc00 +#define KFD_DEBUG_DOORBELL_ID(sq_int_data) ((sq_int_data) & \ + KFD_INT_DATA_DEBUG_DOORBELL_MASK) +#define KFD_DEBUG_TRAP_CODE(sq_int_data) (((sq_int_data) & \ + KFD_INT_DATA_DEBUG_TRAP_CODE_MASK) \ + >> KFD_INT_DATA_DEBUG_TRAP_CODE_SHIFT) +#define KFD_DEBUG_CP_BAD_OP_ECODE_MASK 0x3fffc00 +#define KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT 10 +#define KFD_DEBUG_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) & \ + KFD_DEBUG_CP_BAD_OP_ECODE_MASK) \ + >> KFD_DEBUG_CP_BAD_OP_ECODE_SHIFT) + static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, uint16_t pasid, uint16_t client_id) { @@ -168,14 +218,16 @@ static bool event_interrupt_isr_v9(struct kfd_node *dev, uint16_t source_id, client_id, pasid, vmid; const uint32_t *data = ih_ring_entry; + source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); + client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); + /* Only handle interrupts from KFD VMIDs */ vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry); - if (vmid < dev->vm_info.first_vmid_kfd || - vmid > dev->vm_info.last_vmid_kfd) + if (!KFD_IRQ_IS_FENCE(client_id, source_id) && + (vmid < dev->vm_info.first_vmid_kfd || + vmid > dev->vm_info.last_vmid_kfd)) return false; - source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry); - client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry); pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry); /* Only handle clients we care about */ @@ -194,7 +246,8 @@ static bool event_interrupt_isr_v9(struct kfd_node *dev, client_id != SOC15_IH_CLIENTID_SE0SH && client_id != SOC15_IH_CLIENTID_SE1SH && client_id != SOC15_IH_CLIENTID_SE2SH && - client_id != SOC15_IH_CLIENTID_SE3SH) + client_id != SOC15_IH_CLIENTID_SE3SH && + !KFD_IRQ_IS_FENCE(client_id, source_id)) return false; /* This is a known issue for gfx9. Under non HWS, pasid is not set @@ -247,6 +300,7 @@ static bool event_interrupt_isr_v9(struct kfd_node *dev, source_id == SOC15_INTSRC_SDMA_ECC || source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || source_id == SOC15_INTSRC_CP_BAD_OPCODE || + KFD_IRQ_IS_FENCE(client_id, source_id) || ((client_id == SOC15_IH_CLIENTID_VMC || client_id == SOC15_IH_CLIENTID_VMC1 || client_id == SOC15_IH_CLIENTID_UTCL2) && @@ -302,6 +356,13 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, SIMD_ID), REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID, CU_ID), sq_int_data); + if (context_id0 & SQ_INTERRUPT_WORD_WAVE_CTXID__PRIV_MASK) { + if (kfd_set_dbg_ev_from_interrupt(dev, pasid, + KFD_DEBUG_DOORBELL_ID(sq_int_data), + KFD_DEBUG_TRAP_CODE(sq_int_data), + NULL, 0)) + return; + } break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: sq_intr_err = REG_GET_FIELD(sq_int_data, KFD_SQ_INT_DATA, ERR_TYPE); @@ -324,8 +385,12 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, break; } kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24); - } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) - kfd_signal_hw_exception_event(pasid); + } else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE) { + kfd_set_dbg_ev_from_interrupt(dev, pasid, + KFD_DEBUG_DOORBELL_ID(context_id0), + KFD_EC_MASK(KFD_DEBUG_CP_BAD_OP_ECODE(context_id0)), + NULL, 0); + } } else if (client_id == SOC15_IH_CLIENTID_SDMA0 || client_id == SOC15_IH_CLIENTID_SDMA1 || client_id == SOC15_IH_CLIENTID_SDMA2 || @@ -345,6 +410,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, client_id == SOC15_IH_CLIENTID_UTCL2) { struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + struct kfd_hsa_memory_exception_data exception_data; if (client_id == SOC15_IH_CLIENTID_UTCL2 && amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { @@ -360,9 +426,23 @@ static void event_interrupt_wq_v9(struct kfd_node *dev, info.prot_read = ring_id & 0x10; info.prot_write = ring_id & 0x20; + memset(&exception_data, 0, sizeof(exception_data)); + exception_data.gpu_id = dev->id; + exception_data.va = (info.page_addr) << PAGE_SHIFT; + exception_data.failure.NotPresent = info.prot_valid ? 1 : 0; + exception_data.failure.NoExecute = info.prot_exec ? 1 : 0; + exception_data.failure.ReadOnly = info.prot_write ? 1 : 0; + exception_data.failure.imprecise = 0; + + kfd_set_dbg_ev_from_interrupt(dev, + pasid, + -1, + KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION), + &exception_data, + sizeof(exception_data)); kfd_smi_event_update_vmfault(dev, pasid); - kfd_dqm_evict_pasid(dev->dqm, pasid); - kfd_signal_vm_fault_event(dev, pasid, &info, NULL); + } else if (KFD_IRQ_IS_FENCE(client_id, source_id)) { + kfd_process_close_interrupt_drain(pasid); } } -- cgit