summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c116
1 files changed, 73 insertions, 43 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index 7e2859736a55..2788a52714d1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -148,44 +148,69 @@ enum SQ_INTERRUPT_ERROR_TYPE {
#define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \
KFD_CTXID0_DOORBELL_ID_MASK)
-static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1)
+static void print_sq_intr_info_auto(struct kfd_node *dev, uint32_t context_id0,
+ uint32_t context_id1)
{
- pr_debug_ratelimited(
+ dev_dbg_ratelimited(
+ dev->adev->dev,
"sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_CMD_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR));
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_BUF_FULL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ REG_TIMESTAMP),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ CMD_TIMESTAMP),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ HOST_CMD_OVERFLOW),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ HOST_REG_OVERFLOW),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ IMMED_OVERFLOW),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0,
+ THREAD_TRACE_UTC_ERROR));
}
-static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1)
+static void print_sq_intr_info_inst(struct kfd_node *dev, uint32_t context_id0,
+ uint32_t context_id1)
{
- pr_debug_ratelimited(
+ dev_dbg_ratelimited(
+ dev->adev->dev,
"sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ SH_ID),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID),
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID),
- REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID));
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0,
+ WAVE_ID),
+ REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ SIMD_ID),
+ REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1,
+ WGP_ID));
}
-static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
+static void print_sq_intr_info_error(struct kfd_node *dev, uint32_t context_id0,
+ uint32_t context_id1)
{
- pr_warn_ratelimited(
+ dev_warn_ratelimited(
+ dev->adev->dev,
"sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID),
- REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID));
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ DETAIL),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ TYPE),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ SH_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ PRIV),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0,
+ WAVE_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1,
+ SIMD_ID),
+ REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1,
+ WGP_ID));
}
static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
@@ -193,7 +218,8 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
{
enum amdgpu_ras_block block = 0;
int ret = -EINVAL;
- struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ uint32_t reset = 0;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
if (!p)
return;
@@ -212,10 +238,13 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
+ if (ret)
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC21_INTSRC_SDMA_ECC:
default:
block = AMDGPU_RAS_BLOCK__GFX;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
}
@@ -223,10 +252,7 @@ static void event_interrupt_poison_consumption_v11(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
- if (!ret)
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
- else
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool event_interrupt_isr_v11(struct kfd_node *dev,
@@ -254,14 +280,14 @@ static bool event_interrupt_isr_v11(struct kfd_node *dev,
(context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG))
return false;
- pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
- client_id, source_id, vmid, pasid);
- pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
- data[0], data[1], data[2], data[3],
- data[4], data[5], data[6], data[7]);
+ dev_dbg(dev->adev->dev,
+ "client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
+ client_id, source_id, vmid, pasid);
+ dev_dbg(dev->adev->dev, "%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
+ data[0], data[1], data[2], data[3], data[4], data[5], data[6],
+ data[7]);
- /* If there is no valid PASID, it's likely a bug */
- if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
+ if (pasid == 0)
return false;
/* Interrupt types we care about: various signals and faults.
@@ -328,11 +354,15 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
- else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
- kfd_set_dbg_ev_from_interrupt(dev, pasid,
- KFD_CTXID0_DOORBELL_ID(context_id0),
+ else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE &&
+ KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) {
+ u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0);
+
+ kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id,
KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
NULL, 0);
+ kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id);
+ }
/* SDMA */
else if (source_id == SOC21_INTSRC_SDMA_TRAP)
@@ -348,10 +378,10 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
switch (sq_int_enc) {
case SQ_INTERRUPT_WORD_ENCODING_AUTO:
- print_sq_intr_info_auto(context_id0, context_id1);
+ print_sq_intr_info_auto(dev, context_id0, context_id1);
break;
case SQ_INTERRUPT_WORD_ENCODING_INST:
- print_sq_intr_info_inst(context_id0, context_id1);
+ print_sq_intr_info_inst(dev, context_id0, context_id1);
sq_int_priv = REG_GET_FIELD(context_id0,
SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid,
@@ -361,7 +391,7 @@ static void event_interrupt_wq_v11(struct kfd_node *dev,
return;
break;
case SQ_INTERRUPT_WORD_ENCODING_ERROR:
- print_sq_intr_info_error(context_id0, context_id1);
+ print_sq_intr_info_error(dev, context_id0, context_id1);
sq_int_errtype = REG_GET_FIELD(context_id0,
SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE);
if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&