summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/common/command_submission.c
diff options
context:
space:
mode:
authorYuri Nudelman <ynudelman@habana.ai>2021-05-25 14:49:52 +0300
committerOded Gabbay <ogabbay@kernel.org>2021-06-18 15:23:41 +0300
commit8e8125f192288802267157f613c0ca654dfbde8e (patch)
tree0f64fe8484408c553eece8879d5421d005c63eb4 /drivers/misc/habanalabs/common/command_submission.c
parent254fac6d1a73aac40aa4d423c993965987728040 (diff)
habanalabs: add debug flag to prevent failure on timeout
Sometimes it is useful to allow the command to continue running despite the timeout occurred, to differentiate between really stuck or just very time consuming commands. This can be achieved by passing a new debug flag alongside the cs, HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT. Anyway, if the timeout occurred, a warning print shall be issued, however this shall not fail the submission. Signed-off-by: Yuri Nudelman <ynudelman@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/common/command_submission.c')
-rw-r--r--drivers/misc/habanalabs/common/command_submission.c25
1 files changed, 20 insertions, 5 deletions
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index ecd96fbe3150..6d51f54030c1 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -556,6 +556,13 @@ out:
else if (!cs->submitted)
cs->fence->error = -EBUSY;
+ if (unlikely(cs->skip_reset_on_timeout)) {
+ dev_err(hdev->dev,
+ "Command submission %llu completed after %llu (s)\n",
+ cs->sequence,
+ div_u64(jiffies - cs->submission_time_jiffies, HZ));
+ }
+
if (cs->timestamp)
cs->fence->timestamp = ktime_get();
complete_all(&cs->fence->completion);
@@ -571,6 +578,8 @@ static void cs_timedout(struct work_struct *work)
int rc;
struct hl_cs *cs = container_of(work, struct hl_cs,
work_tdr.work);
+ bool skip_reset_on_timeout = cs->skip_reset_on_timeout;
+
rc = cs_get_unless_zero(cs);
if (!rc)
return;
@@ -581,7 +590,8 @@ static void cs_timedout(struct work_struct *work)
}
/* Mark the CS is timed out so we won't try to cancel its TDR */
- cs->timedout = true;
+ if (likely(!skip_reset_on_timeout))
+ cs->timedout = true;
hdev = cs->ctx->hdev;
@@ -613,10 +623,12 @@ static void cs_timedout(struct work_struct *work)
cs_put(cs);
- if (hdev->reset_on_lockup)
- hl_device_reset(hdev, HL_RESET_TDR);
- else
- hdev->needs_reset = true;
+ if (likely(!skip_reset_on_timeout)) {
+ if (hdev->reset_on_lockup)
+ hl_device_reset(hdev, HL_RESET_TDR);
+ else
+ hdev->needs_reset = true;
+ }
}
static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
@@ -650,6 +662,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs->type = cs_type;
cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
cs->timeout_jiffies = timeout;
+ cs->skip_reset_on_timeout =
+ !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
+ cs->submission_time_jiffies = jiffies;
INIT_LIST_HEAD(&cs->job_list);
INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
kref_init(&cs->refcount);