summaryrefslogtreecommitdiff
path: root/drivers/accel/habanalabs/common/firmware_if.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/accel/habanalabs/common/firmware_if.c')
-rw-r--r--drivers/accel/habanalabs/common/firmware_if.c400
1 files changed, 226 insertions, 174 deletions
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c
index acbc1a6b5cb1..eeb6b2a80fc7 100644
--- a/drivers/accel/habanalabs/common/firmware_if.c
+++ b/drivers/accel/habanalabs/common/firmware_if.c
@@ -6,8 +6,9 @@
*/
#include "habanalabs.h"
-#include "../include/common/hl_boot_if.h"
+#include <linux/habanalabs/hl_boot_if.h>
+#include <linux/pci.h>
#include <linux/firmware.h>
#include <linux/crc32.h>
#include <linux/slab.h>
@@ -40,6 +41,31 @@ static char *comms_sts_str_arr[COMMS_STS_INVLD_LAST] = {
[COMMS_STS_TIMEOUT_ERR] = __stringify(COMMS_STS_TIMEOUT_ERR),
};
+/**
+ * hl_fw_version_cmp() - compares the FW version to a specific version
+ *
+ * @hdev: pointer to hl_device structure
+ * @major: major number of a reference version
+ * @minor: minor number of a reference version
+ * @subminor: sub-minor number of a reference version
+ *
+ * Return 1 if FW version greater than the reference version, -1 if it's
+ * smaller and 0 if versions are identical.
+ */
+int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor)
+{
+ if (hdev->fw_sw_major_ver != major)
+ return (hdev->fw_sw_major_ver > major) ? 1 : -1;
+
+ if (hdev->fw_sw_minor_ver != minor)
+ return (hdev->fw_sw_minor_ver > minor) ? 1 : -1;
+
+ if (hdev->fw_sw_sub_minor_ver != subminor)
+ return (hdev->fw_sw_sub_minor_ver > subminor) ? 1 : -1;
+
+ return 0;
+}
+
static char *extract_fw_ver_from_str(const char *fw_str)
{
char *str, *fw_ver, *whitespace;
@@ -345,43 +371,63 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value)
{
struct cpucp_packet pkt = {};
+ int rc;
pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.value = cpu_to_le64(value);
- return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
+ if (rc)
+ dev_err(hdev->dev, "Failed to disable FW's PCI access\n");
+
+ return rc;
}
+/**
+ * hl_fw_send_cpu_message() - send CPU message to the device.
+ *
+ * @hdev: pointer to hl_device structure.
+ * @hw_queue_id: HW queue ID
+ * @msg: raw data of the message/packet
+ * @size: size of @msg in bytes
+ * @timeout_us: timeout in usec to wait for CPU reply on the message
+ * @result: return code reported by FW
+ *
+ * send message to the device CPU.
+ *
+ * Return: 0 on success, non-zero for failure.
+ * -ENOMEM: memory allocation failure
+ * -EAGAIN: CPU is disabled (try again when enabled)
+ * -ETIMEDOUT: timeout waiting for FW response
+ * -EIO: protocol error
+ */
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
- u16 len, u32 timeout, u64 *result)
+ u16 size, u32 timeout_us, u64 *result)
{
struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
struct asic_fixed_properties *prop = &hdev->asic_prop;
+ u32 tmp, expected_ack_val, pi, opcode;
struct cpucp_packet *pkt;
dma_addr_t pkt_dma_addr;
struct hl_bd *sent_bd;
- u32 tmp, expected_ack_val, pi, opcode;
- int rc;
+ int rc = 0, fw_rc;
- pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr);
+ pkt = hl_cpu_accessible_dma_pool_alloc(hdev, size, &pkt_dma_addr);
if (!pkt) {
- dev_err(hdev->dev,
- "Failed to allocate DMA memory for packet to CPU\n");
+ dev_err(hdev->dev, "Failed to allocate DMA memory for packet to CPU\n");
return -ENOMEM;
}
- memcpy(pkt, msg, len);
+ memcpy(pkt, msg, size);
mutex_lock(&hdev->send_cpu_message_lock);
/* CPU-CP messages can be sent during soft-reset */
- if (hdev->disabled && !hdev->reset_info.in_compute_reset) {
- rc = 0;
+ if (hdev->disabled && !hdev->reset_info.in_compute_reset)
goto out;
- }
if (hdev->device_cpu_disabled) {
- rc = -EIO;
+ rc = -EAGAIN;
goto out;
}
@@ -397,7 +443,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
* Which means that we don't need to lock the access to the entire H/W
* queues module when submitting a JOB to the CPU queue.
*/
- hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr);
+ hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), size, pkt_dma_addr);
if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
expected_ack_val = queue->pi;
@@ -406,7 +452,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == expected_ack_val), 1000,
- timeout, true);
+ timeout_us, true);
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@@ -414,19 +460,27 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
/* If FW performed reset just before sending it a packet, we will get a timeout.
* This is expected behavior, hence no need for error message.
*/
- if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset)
+ if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset) {
dev_dbg(hdev->dev, "Device CPU packet timeout (0x%x) due to FW reset\n",
tmp);
- else
- dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n", tmp);
+ } else {
+ struct hl_bd *bd = queue->kernel_address;
+
+ bd += hl_pi_2_offset(pi);
+
+ dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n"
+ "Pkt info[%u]: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n",
+ tmp, pi, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr,
+ queue->dram_bd);
+ }
hdev->device_cpu_disabled = true;
goto out;
}
tmp = le32_to_cpu(pkt->ctl);
- rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
- if (rc) {
+ fw_rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
+ if (fw_rc) {
opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT;
if (!prop->supports_advanced_cpucp_rc) {
@@ -435,7 +489,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
goto scrub_descriptor;
}
- switch (rc) {
+ switch (fw_rc) {
case cpucp_packet_invalid:
dev_err(hdev->dev,
"CPU packet %d is not supported by F/W\n", opcode);
@@ -460,7 +514,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
/* propagate the return code from the f/w to the callers who want to check it */
if (result)
- *result = rc;
+ *result = fw_rc;
rc = -EIO;
@@ -480,7 +534,7 @@ scrub_descriptor:
out:
mutex_unlock(&hdev->send_cpu_message_lock);
- hl_cpu_accessible_dma_pool_free(hdev, len, pkt);
+ hl_cpu_accessible_dma_pool_free(hdev, size, pkt);
return rc;
}
@@ -501,7 +555,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
0, &result);
if (rc)
- dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
+ dev_err(hdev->dev, "failed to unmask event %d", event_type);
return rc;
}
@@ -540,7 +594,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
total_pkt_size, 0, &result);
if (rc)
- dev_err(hdev->dev, "failed to unmask IRQ array\n");
+ dev_err(hdev->dev, "failed to unmask event array\n");
kfree(pkt);
@@ -550,7 +604,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
int hl_fw_test_cpu_queue(struct hl_device *hdev)
{
struct cpucp_packet test_pkt = {};
- u64 result;
+ u64 result = 0;
int rc;
test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
@@ -623,16 +677,14 @@ int hl_fw_send_device_activity(struct hl_device *hdev, bool open)
int hl_fw_send_heartbeat(struct hl_device *hdev)
{
struct cpucp_packet hb_pkt;
- u64 result;
+ u64 result = 0;
int rc;
memset(&hb_pkt, 0, sizeof(hb_pkt));
- hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
- CPUCP_PKT_CTL_OPCODE_SHIFT);
+ hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT);
hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
- sizeof(hb_pkt), 0, &result);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, sizeof(hb_pkt), 0, &result);
if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
return -EIO;
@@ -643,42 +695,32 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
rc = -EIO;
}
+ hdev->heartbeat_debug_info.last_pq_heartbeat_ts = ktime_get_real_seconds();
+
return rc;
}
-static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
- u32 sts_val)
+static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val)
{
bool err_exists = false;
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
return false;
- if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
- dev_err(hdev->dev,
- "Device boot error - DRAM initialization failed\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
+ dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n");
- if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
+ if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
- dev_err(hdev->dev,
- "Device boot error - Thermal Sensor initialization failed\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
+ dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n");
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
if (hdev->bmc_enable) {
- dev_err(hdev->dev,
- "Device boot error - Skipped waiting for BMC\n");
- err_exists = true;
+ dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n");
} else {
- dev_info(hdev->dev,
- "Device boot message - Skipped waiting for BMC\n");
+ dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n");
/* This is an info so we don't want it to disable the
* device
*/
@@ -686,43 +728,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
}
}
- if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
- dev_err(hdev->dev,
- "Device boot error - Serdes data from BMC not available\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
+ dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n");
- if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
- dev_err(hdev->dev,
- "Device boot error - NIC F/W initialization failed\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
+ dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n");
- if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
- dev_err(hdev->dev,
- "Device boot warning - security not ready\n");
- err_exists = true;
- }
+ if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
+ dev_err(hdev->dev, "Device boot warning - security not ready\n");
- if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
dev_err(hdev->dev, "Device boot error - security failure\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL)
dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
- err_exists = true;
- }
- if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
dev_err(hdev->dev, "Device boot error - PLL failure\n");
- err_exists = true;
- }
+
+ if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL)
+ dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n");
if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
/* Ignore this bit, don't prevent driver loading */
@@ -730,52 +758,32 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL;
}
- if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) {
+ if (err_val & CPU_BOOT_ERR0_BINNING_FAIL)
dev_err(hdev->dev, "Device boot error - binning failure\n");
- err_exists = true;
- }
if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
+ if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
+ dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n");
+
+ if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
+ dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n");
+
+ /* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
err_exists = true;
}
- /* All warnings should go here in order not to reach the unknown error validation */
- if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
- dev_warn(hdev->dev,
- "Device boot warning - Skipped DRAM initialization\n");
- /* This is a warning so we don't want it to disable the
- * device
- */
- err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
- }
-
- if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
- dev_warn(hdev->dev,
- "Device boot warning - Failed to load preboot primary image\n");
- /* This is a warning so we don't want it to disable the
- * device as we have a secondary preboot image
- */
- err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
- }
+ if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL)
+ dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n");
- if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
- dev_warn(hdev->dev,
- "Device boot warning - TPM failure\n");
- /* This is a warning so we don't want it to disable the
- * device
- */
- err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
- }
+ if (err_val & CPU_BOOT_ERR0_TPM_FAIL)
+ dev_warn(hdev->dev, "Device boot warning - TPM failure\n");
- if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
- dev_err(hdev->dev,
- "Device boot error - unknown ERR0 error 0x%08x\n", err_val);
+ if (err_val & CPU_BOOT_ERR_FATAL_MASK)
err_exists = true;
- }
/* return error only if it's in the predefined mask */
if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
@@ -931,7 +939,7 @@ static int hl_fw_send_msi_info_msg(struct hl_device *hdev)
{
struct cpucp_array_data_packet *pkt;
size_t total_pkt_size, data_size;
- u64 result;
+ u64 result = 0;
int rc;
/* skip sending this info for unsupported ASICs */
@@ -1022,11 +1030,10 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);
-
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP EEPROM packet, error %d\n",
- rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP EEPROM packet, error %d\n", rc);
goto out;
}
@@ -1067,7 +1074,9 @@ int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
goto out;
}
@@ -1101,8 +1110,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->rx_throughput = result;
@@ -1116,8 +1126,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->tx_throughput = result;
@@ -1130,8 +1141,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
return rc;
}
counters->replay_cnt = (u32) result;
@@ -1151,9 +1163,9 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CpuCP total energy pkt, error %d\n",
- rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CpuCP total energy pkt, error %d\n", rc);
return rc;
}
@@ -1229,7 +1241,8 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
return rc;
}
@@ -1256,7 +1269,8 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to read power, error %d\n", rc);
return rc;
}
@@ -1293,8 +1307,9 @@ int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
goto out;
}
@@ -1319,7 +1334,8 @@ int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num)
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
if (rc) {
- dev_err(hdev->dev,
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
"Failed to handle CPU-CP pending rows info pkt, error %d\n", rc);
goto out;
}
@@ -1459,6 +1475,10 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
dev_err(hdev->dev,
"Device boot progress - Stuck in preboot after security initialization\n");
break;
+ case CPU_BOOT_STATUS_FW_SHUTDOWN_PREP:
+ dev_err(hdev->dev,
+ "Device boot progress - Stuck in preparation for shutdown\n");
+ break;
default:
dev_err(hdev->dev,
"Device boot progress - Invalid or unexpected status code %d\n", status);
@@ -1469,8 +1489,9 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
int hl_fw_wait_preboot_ready(struct hl_device *hdev)
{
struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load;
- u32 status;
- int rc;
+ u32 status = 0, timeout;
+ int rc, tries = 1, fw_err = 0;
+ bool preboot_still_runs;
/* Need to check two possible scenarios:
*
@@ -1480,6 +1501,8 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev)
* All other status values - for older firmwares where the uboot was
* loaded from the FLASH
*/
+ timeout = pre_fw_load->wait_for_preboot_timeout;
+retry:
rc = hl_poll_timeout(
hdev,
pre_fw_load->cpu_boot_status_reg,
@@ -1488,20 +1511,37 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev)
(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
hdev->fw_poll_interval_usec,
- pre_fw_load->wait_for_preboot_timeout);
+ timeout);
+ /*
+ * if F/W reports "security-ready" it means preboot might take longer.
+ * If the field 'wait_for_preboot_extended_timeout' is non 0 we wait again
+ * with that timeout
+ */
+ preboot_still_runs = (status == CPU_BOOT_STATUS_SECURITY_READY ||
+ status == CPU_BOOT_STATUS_IN_PREBOOT ||
+ status == CPU_BOOT_STATUS_FW_SHUTDOWN_PREP ||
+ status == CPU_BOOT_STATUS_DRAM_RDY);
+
+ if (rc && tries && preboot_still_runs) {
+ tries--;
+ if (pre_fw_load->wait_for_preboot_extended_timeout) {
+ timeout = pre_fw_load->wait_for_preboot_extended_timeout;
+ goto retry;
+ }
+ }
- if (rc) {
+ /* If we read all FF, then something is totally wrong, no point
+ * of reading specific errors
+ */
+ if (status != -1)
+ fw_err = fw_read_errors(hdev, pre_fw_load->boot_err0_reg,
+ pre_fw_load->boot_err1_reg,
+ pre_fw_load->sts_boot_dev_sts0_reg,
+ pre_fw_load->sts_boot_dev_sts1_reg);
+ if (rc || fw_err) {
detect_cpu_boot_status(hdev, status);
- dev_err(hdev->dev, "CPU boot ready timeout (status = %d)\n", status);
-
- /* If we read all FF, then something is totally wrong, no point
- * of reading specific errors
- */
- if (status != -1)
- fw_read_errors(hdev, pre_fw_load->boot_err0_reg,
- pre_fw_load->boot_err1_reg,
- pre_fw_load->sts_boot_dev_sts0_reg,
- pre_fw_load->sts_boot_dev_sts1_reg);
+ dev_err(hdev->dev, "CPU boot %s (status = %d)\n",
+ fw_err ? "failed due to an error" : "ready timeout", status);
return -EIO;
}
@@ -1772,7 +1812,7 @@ static void hl_fw_dynamic_send_cmd(struct hl_device *hdev,
val = FIELD_PREP(COMMS_COMMAND_CMD_MASK, cmd);
val |= FIELD_PREP(COMMS_COMMAND_SIZE_MASK, size);
- trace_habanalabs_comms_send_cmd(hdev->dev, comms_cmd_str_arr[cmd]);
+ trace_habanalabs_comms_send_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]);
WREG32(le32_to_cpu(dyn_regs->kmd_msg_to_cpu), val);
}
@@ -1830,7 +1870,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev,
dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs;
- trace_habanalabs_comms_wait_status(hdev->dev, comms_sts_str_arr[expected_status]);
+ trace_habanalabs_comms_wait_status(&hdev->pdev->dev, comms_sts_str_arr[expected_status]);
/* Wait for expected status */
rc = hl_poll_timeout(
@@ -1847,7 +1887,8 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev,
return -EIO;
}
- trace_habanalabs_comms_wait_status_done(hdev->dev, comms_sts_str_arr[expected_status]);
+ trace_habanalabs_comms_wait_status_done(&hdev->pdev->dev,
+ comms_sts_str_arr[expected_status]);
/*
* skip storing FW response for NOOP to preserve the actual desired
@@ -1921,7 +1962,7 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
{
int rc;
- trace_habanalabs_comms_protocol_cmd(hdev->dev, comms_cmd_str_arr[cmd]);
+ trace_habanalabs_comms_protocol_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]);
/* first send clear command to clean former commands */
rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader);
@@ -2060,7 +2101,7 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev,
* note that no alignment/stride address issues here as all structures
* are 64 bit padded.
*/
- data_ptr = (u8 *)fw_desc + sizeof(struct comms_desc_header);
+ data_ptr = (u8 *)fw_desc + sizeof(struct comms_msg_header);
data_size = le16_to_cpu(fw_desc->header.size);
data_crc32 = hl_fw_compat_crc32(data_ptr, data_size);
@@ -2214,11 +2255,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
fw_data_size = le16_to_cpu(fw_desc->header.size);
- temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size);
+ temp_fw_desc = vzalloc(sizeof(struct comms_msg_header) + fw_data_size);
if (!temp_fw_desc)
return -ENOMEM;
- memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size);
+ memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_msg_header) + fw_data_size);
rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader,
(struct lkd_fw_comms_desc *) temp_fw_desc);
@@ -2740,17 +2781,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
}
+ rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms));
+ if (rc)
+ goto protocol_err;
+
+ if (hdev->asic_prop.support_dynamic_resereved_fw_size)
+ hdev->asic_prop.reserved_fw_mem_size =
+ le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M;
+
if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
struct lkd_fw_binning_info *binning_info;
- rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, 0);
- if (rc)
- goto protocol_err;
-
/* read preboot version */
rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
-
if (rc)
return rc;
@@ -2811,7 +2855,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
hdev->asic_funcs->init_cpu_scrambler_dram(hdev);
if (!(hdev->fw_components & FW_TYPE_LINUX)) {
- dev_info(hdev->dev, "Skip loading Linux F/W\n");
+ dev_dbg(hdev->dev, "Skip loading Linux F/W\n");
return 0;
}
@@ -3141,10 +3185,10 @@ long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
pkt.pll_index = cpu_to_le32((u32)used_pll_idx);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
-
if (rc) {
- dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n",
- used_pll_idx, rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n",
+ used_pll_idx, rc);
return rc;
}
@@ -3168,8 +3212,7 @@ void hl_fw_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
pkt.value = cpu_to_le64(freq);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to set frequency to PLL %d, error %d\n",
used_pll_idx, rc);
}
@@ -3185,9 +3228,9 @@ long hl_fw_get_max_power(struct hl_device *hdev)
pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
-
if (rc) {
- dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
return rc;
}
@@ -3209,8 +3252,7 @@ void hl_fw_set_max_power(struct hl_device *hdev)
pkt.value = cpu_to_le64(hdev->max_power);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL);
-
- if (rc)
+ if (rc && rc != -EAGAIN)
dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);
}
@@ -3236,11 +3278,11 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void
pkt.data_max_size = cpu_to_le32(size);
pkt.nonce = cpu_to_le32(nonce);
- rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
- timeout, NULL);
+ rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), timeout, NULL);
if (rc) {
- dev_err(hdev->dev,
- "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev,
+ "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc);
goto out;
}
@@ -3260,6 +3302,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
}
+int hl_fw_get_dev_info_signed(struct hl_device *hdev,
+ struct cpucp_dev_info_signed *dev_info_signed, u32 nonce)
+{
+ return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed,
+ sizeof(struct cpucp_dev_info_signed), nonce,
+ HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
+}
+
int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
dma_addr_t buff, u32 *size)
{
@@ -3274,10 +3324,12 @@ int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)&pkt, sizeof(pkt),
HL_CPUCP_INFO_TIMEOUT_USEC, &result);
- if (rc)
- dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n");
- else
+ if (rc) {
+ if (rc != -EAGAIN)
+ dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n");
+ } else {
dev_dbg(hdev->dev, "generic pkt was successful, result: 0x%llx\n", result);
+ }
*size = (u32)result;