summary | refs | log | tree | commit | diff
path: root/drivers/accel/habanalabs/common/device.c
diff options
context:
space:
mode:
author: Farah Kassabri <fkassabri@habana.ai> 2024-02-21 11:47:12 +0200
committer: Ofir Bitton <obitton@habana.ai> 2024-06-23 09:52:53 +0300
commit: 31bd26931d036593531dbc9b5dd0669fe9d53155 (patch)
tree: 57ef02ead7dd7ef68d5ce4182521751479778cff /drivers/accel/habanalabs/common/device.c
parent: 42f04ca65c7294ce7c641d2195086f2c99323320 (diff)
accel/habanalabs: add heartbeat debug info
Heartbeat check failures are hard to debug. To ease this task, this patch prints more information when such a failure happens. The heartbeat checks the communication with FW, so printing the CPU queue pi/ci, the event queue ci and previous index, and the counter of how many heartbeat events were received helps in debugging the issue. Signed-off-by: Farah Kassabri <fkassabri@habana.ai> Reviewed-by: Ofir Bitton <obitton@habana.ai> Signed-off-by: Ofir Bitton <obitton@habana.ai>
Diffstat (limited to 'drivers/accel/habanalabs/common/device.c')
-rw-r--r-- drivers/accel/habanalabs/common/device.c | 12
1 files changed, 12 insertions, 0 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
index bb3f44392908..35502e938b5d 100644
--- a/drivers/accel/habanalabs/common/device.c
+++ b/drivers/accel/habanalabs/common/device.c
@@ -1052,12 +1052,22 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
static bool hl_device_eq_heartbeat_received(struct hl_device *hdev)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 cpu_q_id;
if (!prop->cpucp_info.eq_health_check_supported)
return true;
if (!hdev->eq_heartbeat_received) {
+ cpu_q_id = hdev->heartbeat_debug_info.cpu_queue_id;
+
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
+
+ dev_err(hdev->dev, "Heartbeat events counter: %u, Q_PI: %u, Q_CI: %u, EQ CI: %u, EQ prev: %u\n",
+ hdev->heartbeat_debug_info.heartbeat_event_counter,
+ hdev->kernel_queues[cpu_q_id].pi,
+ atomic_read(&hdev->kernel_queues[cpu_q_id].ci),
+ hdev->event_queue.ci,
+ hdev->event_queue.prev_eqe_index);
return false;
}
@@ -1138,6 +1148,8 @@ static int device_late_init(struct hl_device *hdev)
hdev->high_pll = hdev->asic_prop.high_pll;
if (hdev->heartbeat) {
+ hdev->heartbeat_debug_info.heartbeat_event_counter = 0;
+
/*
* Before scheduling the heartbeat driver will check if eq event has received.
* for the first schedule we need to set the indication as true then for the next