diff options
author | Yuri Nudelman <ynudelman@habana.ai> | 2021-05-25 14:49:52 +0300 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2021-06-18 15:23:41 +0300 |
commit | 8e8125f192288802267157f613c0ca654dfbde8e (patch) | |
tree | 0f64fe8484408c553eece8879d5421d005c63eb4 /drivers/misc/habanalabs/common/command_submission.c | |
parent | 254fac6d1a73aac40aa4d423c993965987728040 (diff) |
habanalabs: add debug flag to prevent failure on timeout
Sometimes it is useful to allow the command to continue running despite
the timeout occurred, to differentiate between really stuck or just very
time consuming commands. This can be achieved by passing a new debug
flag alongside the cs, HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT.
Anyway, if the timeout occurred, a warning print shall be issued,
however this shall not fail the submission.
Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/common/command_submission.c')
-rw-r--r-- | drivers/misc/habanalabs/common/command_submission.c | 25 |
1 files changed, 20 insertions, 5 deletions
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index ecd96fbe3150..6d51f54030c1 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -556,6 +556,13 @@ out: else if (!cs->submitted) cs->fence->error = -EBUSY; + if (unlikely(cs->skip_reset_on_timeout)) { + dev_err(hdev->dev, + "Command submission %llu completed after %llu (s)\n", + cs->sequence, + div_u64(jiffies - cs->submission_time_jiffies, HZ)); + } + if (cs->timestamp) cs->fence->timestamp = ktime_get(); complete_all(&cs->fence->completion); @@ -571,6 +578,8 @@ static void cs_timedout(struct work_struct *work) int rc; struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); + bool skip_reset_on_timeout = cs->skip_reset_on_timeout; + rc = cs_get_unless_zero(cs); if (!rc) return; @@ -581,7 +590,8 @@ static void cs_timedout(struct work_struct *work) } /* Mark the CS is timed out so we won't try to cancel its TDR */ - cs->timedout = true; + if (likely(!skip_reset_on_timeout)) + cs->timedout = true; hdev = cs->ctx->hdev; @@ -613,10 +623,12 @@ static void cs_timedout(struct work_struct *work) cs_put(cs); - if (hdev->reset_on_lockup) - hl_device_reset(hdev, HL_RESET_TDR); - else - hdev->needs_reset = true; + if (likely(!skip_reset_on_timeout)) { + if (hdev->reset_on_lockup) + hl_device_reset(hdev, HL_RESET_TDR); + else + hdev->needs_reset = true; + } } static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, @@ -650,6 +662,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->type = cs_type; cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP); cs->timeout_jiffies = timeout; + cs->skip_reset_on_timeout = + !!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT); + cs->submission_time_jiffies = jiffies; INIT_LIST_HEAD(&cs->job_list); INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout); kref_init(&cs->refcount); |