From af5f7eea45e1b177db961c4706625f4cf545c063 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 28 Feb 2019 10:46:21 +0200 Subject: habanalabs: soft-reset device if context-switch fails This patch fixes a bug in the driver, where if the TPC or MME remains non-IDLE even after all the command submissions are done (due to a user bug or a malicious user), then future command submissions will fail in the context-switch stage and the driver will remain in "stuck" mode. The fix is to do a soft-reset of the device in case the context-switch fails, because the device should be IDLE during context-switch. If it is not IDLE, then something is wrong and we should reset the compute engines. Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/command_submission.c | 16 +++++++++------- drivers/misc/habanalabs/goya/goya.c | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index 25ad9d805cfa..3525236ed8d9 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -622,13 +622,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) "Failed to switch to context %d, rejecting CS! %d\n", ctx->asid, rc); /* - * If we timedout, we need to soft-reset because - * QMAN is probably stuck. However, we can't - * call to reset here directly because of - * deadlock, so need to do it at the very end - * of this function + * If we timedout, or if the device is not IDLE + * while we want to do context-switch (-EBUSY), + * we need to soft-reset because QMAN is + * probably stuck. 
However, we can't call to + * reset here directly because of deadlock, so + * need to do it at the very end of this + * function */ - if (rc == -ETIMEDOUT) + if ((rc == -ETIMEDOUT) || (rc == -EBUSY)) need_soft_reset = true; mutex_unlock(&hpriv->restore_phase_mutex); goto out; @@ -706,7 +708,7 @@ out: args->out.seq = cs_seq; } - if ((rc == -ETIMEDOUT) && (need_soft_reset)) + if (((rc == -ETIMEDOUT) || (rc == -EBUSY)) && (need_soft_reset)) hl_device_reset(hdev, false, false); return rc; diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 39824214ce61..11597432f519 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -3138,7 +3138,7 @@ static int goya_send_job_on_qman0(struct hl_device *hdev, struct hl_cs_job *job) if (!hdev->asic_funcs->is_device_idle(hdev)) { dev_err_ratelimited(hdev->dev, "Can't send KMD job on QMAN0 if device is not idle\n"); - return -EFAULT; + return -EBUSY; } fence_ptr = hdev->asic_funcs->dma_pool_zalloc(hdev, 4, GFP_KERNEL, -- cgit