diff options
Diffstat (limited to 'drivers/accel/habanalabs/common/firmware_if.c')
| -rw-r--r-- | drivers/accel/habanalabs/common/firmware_if.c | 625 |
1 files changed, 394 insertions, 231 deletions
diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index da892d8fb3d6..eeb6b2a80fc7 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -6,8 +6,9 @@ */ #include "habanalabs.h" -#include "../include/common/hl_boot_if.h" +#include <linux/habanalabs/hl_boot_if.h> +#include <linux/pci.h> #include <linux/firmware.h> #include <linux/crc32.h> #include <linux/slab.h> @@ -40,6 +41,31 @@ static char *comms_sts_str_arr[COMMS_STS_INVLD_LAST] = { [COMMS_STS_TIMEOUT_ERR] = __stringify(COMMS_STS_TIMEOUT_ERR), }; +/** + * hl_fw_version_cmp() - compares the FW version to a specific version + * + * @hdev: pointer to hl_device structure + * @major: major number of a reference version + * @minor: minor number of a reference version + * @subminor: sub-minor number of a reference version + * + * Return 1 if FW version greater than the reference version, -1 if it's + * smaller and 0 if versions are identical. + */ +int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor) +{ + if (hdev->fw_sw_major_ver != major) + return (hdev->fw_sw_major_ver > major) ? 1 : -1; + + if (hdev->fw_sw_minor_ver != minor) + return (hdev->fw_sw_minor_ver > minor) ? 1 : -1; + + if (hdev->fw_sw_sub_minor_ver != subminor) + return (hdev->fw_sw_sub_minor_ver > subminor) ? 1 : -1; + + return 0; +} + static char *extract_fw_ver_from_str(const char *fw_str) { char *str, *fw_ver, *whitespace; @@ -71,41 +97,124 @@ free_fw_ver: return NULL; } -static int extract_fw_sub_versions(struct hl_device *hdev, char *preboot_ver) +/** + * extract_u32_until_given_char() - given a string of the format "<u32><char>*", extract the u32. + * @str: the given string + * @ver_num: the pointer to the extracted u32 to be returned to the caller. + * @given_char: the given char at the end of the u32 in the string + * + * Return: Upon success, return a pointer to the given_char in the string. Upon failure, return NULL + */ +static char *extract_u32_until_given_char(char *str, u32 *ver_num, char given_char) { - char major[8], minor[8], *first_dot, *second_dot; - int rc; + char num_str[8] = {}, *ch; - first_dot = strnstr(preboot_ver, ".", 10); - if (first_dot) { - strscpy(major, preboot_ver, first_dot - preboot_ver + 1); - rc = kstrtou32(major, 10, &hdev->fw_major_version); - } else { - rc = -EINVAL; - } + ch = strchrnul(str, given_char); + if (*ch == '\0' || ch == str || ch - str >= sizeof(num_str)) + return NULL; - if (rc) { - dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc); - goto out; + memcpy(num_str, str, ch - str); + if (kstrtou32(num_str, 10, ver_num)) + return NULL; + return ch; +} + +/** + * hl_get_sw_major_minor_subminor() - extract the FW's SW version major, minor, sub-minor + * from the version string + * @hdev: pointer to the hl_device + * @fw_str: the FW's version string + * + * The extracted version is set in the hdev fields: fw_sw_{major/minor/sub_minor}_ver. + * + * fw_str is expected to have one of two possible formats, examples: + * 1) 'Preboot version hl-gaudi2-1.9.0-fw-42.0.1-sec-3' + * 2) 'Preboot version hl-gaudi2-1.9.0-rc-fw-42.0.1-sec-3' + * In those examples, the SW major,minor,subminor are correspondingly: 1,9,0. + * + * Return: 0 for success or a negative error code for failure. + */ +static int hl_get_sw_major_minor_subminor(struct hl_device *hdev, const char *fw_str) +{ + char *end, *start; + + end = strnstr(fw_str, "-rc-", VERSION_MAX_LEN); + if (end == fw_str) + return -EINVAL; + + if (!end) + end = strnstr(fw_str, "-fw-", VERSION_MAX_LEN); + + if (end == fw_str) + return -EINVAL; + + if (!end) + return -EINVAL; + + for (start = end - 1; start != fw_str; start--) { + if (*start == '-') + break; } - /* skip the first dot */ - first_dot++; + if (start == fw_str) + return -EINVAL; - second_dot = strnstr(first_dot, ".", 10); - if (second_dot) { - strscpy(minor, first_dot, second_dot - first_dot + 1); - rc = kstrtou32(minor, 10, &hdev->fw_minor_version); - } else { - rc = -EINVAL; + /* start/end point each to the starting and ending hyphen of the sw version e.g. -1.9.0- */ + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_major_ver, '.'); + if (!start) + goto err_zero_ver; + + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_minor_ver, '.'); + if (!start) + goto err_zero_ver; + + start++; + start = extract_u32_until_given_char(start, &hdev->fw_sw_sub_minor_ver, '-'); + if (!start) + goto err_zero_ver; + + return 0; + +err_zero_ver: + hdev->fw_sw_major_ver = 0; + hdev->fw_sw_minor_ver = 0; + hdev->fw_sw_sub_minor_ver = 0; + return -EINVAL; +} + +/** + * hl_get_preboot_major_minor() - extract the FW's version major, minor from the version string. + * @hdev: pointer to the hl_device + * @preboot_ver: the FW's version string + * + * preboot_ver is expected to be the format of <major>.<minor>.<sub minor>*, e.g: 42.0.1-sec-3 + * The extracted version is set in the hdev fields: fw_inner_{major/minor}_ver. + * + * Return: 0 on success, negative error code for failure. + */ +static int hl_get_preboot_major_minor(struct hl_device *hdev, char *preboot_ver) +{ + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_major_ver, '.'); + if (!preboot_ver) { + dev_err(hdev->dev, "Error parsing preboot major version\n"); + goto err_zero_ver; } - if (rc) - dev_err(hdev->dev, "Error %d parsing preboot minor version\n", rc); + preboot_ver++; -out: - kfree(preboot_ver); - return rc; + preboot_ver = extract_u32_until_given_char(preboot_ver, &hdev->fw_inner_minor_ver, '.'); + if (!preboot_ver) { + dev_err(hdev->dev, "Error parsing preboot minor version\n"); + goto err_zero_ver; + } + return 0; + +err_zero_ver: + hdev->fw_inner_major_ver = 0; + hdev->fw_inner_minor_ver = 0; + return -EINVAL; } static int hl_request_fw(struct hl_device *hdev, @@ -262,43 +371,63 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value) { struct cpucp_packet pkt = {}; + int rc; pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT); pkt.value = cpu_to_le64(value); - return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc) + dev_err(hdev->dev, "Failed to disable FW's PCI access\n"); + + return rc; } +/** + * hl_fw_send_cpu_message() - send CPU message to the device. + * + * @hdev: pointer to hl_device structure. + * @hw_queue_id: HW queue ID + * @msg: raw data of the message/packet + * @size: size of @msg in bytes + * @timeout_us: timeout in usec to wait for CPU reply on the message + * @result: return code reported by FW + * + * send message to the device CPU. + * + * Return: 0 on success, non-zero for failure. + * -ENOMEM: memory allocation failure + * -EAGAIN: CPU is disabled (try again when enabled) + * -ETIMEDOUT: timeout waiting for FW response + * -EIO: protocol error + */ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, - u16 len, u32 timeout, u64 *result) + u16 size, u32 timeout_us, u64 *result) { struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id]; struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 tmp, expected_ack_val, pi, opcode; struct cpucp_packet *pkt; dma_addr_t pkt_dma_addr; struct hl_bd *sent_bd; - u32 tmp, expected_ack_val, pi, opcode; - int rc; + int rc = 0, fw_rc; - pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr); + pkt = hl_cpu_accessible_dma_pool_alloc(hdev, size, &pkt_dma_addr); if (!pkt) { - dev_err(hdev->dev, - "Failed to allocate DMA memory for packet to CPU\n"); + dev_err(hdev->dev, "Failed to allocate DMA memory for packet to CPU\n"); return -ENOMEM; } - memcpy(pkt, msg, len); + memcpy(pkt, msg, size); mutex_lock(&hdev->send_cpu_message_lock); /* CPU-CP messages can be sent during soft-reset */ - if (hdev->disabled && !hdev->reset_info.in_compute_reset) { - rc = 0; + if (hdev->disabled && !hdev->reset_info.in_compute_reset) goto out; - } if (hdev->device_cpu_disabled) { - rc = -EIO; + rc = -EAGAIN; goto out; } @@ -314,7 +443,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, * Which means that we don't need to lock the access to the entire H/W * queues module when submitting a JOB to the CPU queue. */ - hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr); + hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), size, pkt_dma_addr); if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN) expected_ack_val = queue->pi; @@ -323,7 +452,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp, (tmp == expected_ack_val), 1000, - timeout, true); + timeout_us, true); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); @@ -331,19 +460,27 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, /* If FW performed reset just before sending it a packet, we will get a timeout. * This is expected behavior, hence no need for error message. */ - if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset) + if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset) { dev_dbg(hdev->dev, "Device CPU packet timeout (0x%x) due to FW reset\n", tmp); - else - dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n", tmp); + } else { + struct hl_bd *bd = queue->kernel_address; + + bd += hl_pi_2_offset(pi); + + dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n" + "Pkt info[%u]: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n", + tmp, pi, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr, + queue->dram_bd); + } hdev->device_cpu_disabled = true; goto out; } tmp = le32_to_cpu(pkt->ctl); - rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT; - if (rc) { + fw_rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT; + if (fw_rc) { opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT; if (!prop->supports_advanced_cpucp_rc) { @@ -352,7 +489,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, goto scrub_descriptor; } - switch (rc) { + switch (fw_rc) { case cpucp_packet_invalid: dev_err(hdev->dev, "CPU packet %d is not supported by F/W\n", opcode); @@ -377,7 +514,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, /* propagate the return code from the f/w to the callers who want to check it */ if (result) - *result = rc; + *result = fw_rc; rc = -EIO; @@ -397,7 +534,7 @@ scrub_descriptor: out: mutex_unlock(&hdev->send_cpu_message_lock); - hl_cpu_accessible_dma_pool_free(hdev, len, pkt); + hl_cpu_accessible_dma_pool_free(hdev, size, pkt); return rc; } @@ -418,7 +555,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type) 0, &result); if (rc) - dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type); + dev_err(hdev->dev, "failed to unmask event %d", event_type); return rc; } @@ -457,7 +594,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, total_pkt_size, 0, &result); if (rc) - dev_err(hdev->dev, "failed to unmask IRQ array\n"); + dev_err(hdev->dev, "failed to unmask event array\n"); kfree(pkt); @@ -467,7 +604,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, int hl_fw_test_cpu_queue(struct hl_device *hdev) { struct cpucp_packet test_pkt = {}; - u64 result; + u64 result = 0; int rc; test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << @@ -508,6 +645,20 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, size); } +int hl_fw_send_soft_reset(struct hl_device *hdev) +{ + struct cpucp_packet pkt; + int rc; + + memset(&pkt, 0, sizeof(pkt)); + pkt.ctl = cpu_to_le32(CPUCP_PACKET_SOFT_RESET << CPUCP_PKT_CTL_OPCODE_SHIFT); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc) + dev_err(hdev->dev, "failed to send soft-reset msg (err = %d)\n", rc); + + return rc; +} + int hl_fw_send_device_activity(struct hl_device *hdev, bool open) { struct cpucp_packet pkt; @@ -526,16 +677,14 @@ int hl_fw_send_device_activity(struct hl_device *hdev, bool open) int hl_fw_send_heartbeat(struct hl_device *hdev) { struct cpucp_packet hb_pkt; - u64 result; + u64 result = 0; int rc; memset(&hb_pkt, 0, sizeof(hb_pkt)); - hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << - CPUCP_PKT_CTL_OPCODE_SHIFT); + hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT); hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, - sizeof(hb_pkt), 0, &result); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, sizeof(hb_pkt), 0, &result); if ((rc) || (result != CPUCP_PACKET_FENCE_VAL)) return -EIO; @@ -546,42 +695,32 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) rc = -EIO; } + hdev->heartbeat_debug_info.last_pq_heartbeat_ts = ktime_get_real_seconds(); + return rc; } -static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, - u32 sts_val) +static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val) { bool err_exists = false; if (!(err_val & CPU_BOOT_ERR0_ENABLED)) return false; - if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) { - dev_err(hdev->dev, - "Device boot error - DRAM initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) { + if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) { - dev_err(hdev->dev, - "Device boot error - Thermal Sensor initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n"); if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { if (hdev->bmc_enable) { - dev_err(hdev->dev, - "Device boot error - Skipped waiting for BMC\n"); - err_exists = true; + dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n"); } else { - dev_info(hdev->dev, - "Device boot message - Skipped waiting for BMC\n"); + dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n"); /* This is an info so we don't want it to disable the * device */ @@ -589,43 +728,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, } } - if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) { - dev_err(hdev->dev, - "Device boot error - Serdes data from BMC not available\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) + dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n"); - if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) { - dev_err(hdev->dev, - "Device boot error - NIC F/W initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) + dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) { - dev_err(hdev->dev, - "Device boot warning - security not ready\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) + dev_err(hdev->dev, "Device boot warning - security not ready\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) { + if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) dev_err(hdev->dev, "Device boot error - security failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) { + if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) dev_err(hdev->dev, "Device boot error - eFuse failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) { + if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_PLL_FAIL) { + if (err_val & CPU_BOOT_ERR0_PLL_FAIL) dev_err(hdev->dev, "Device boot error - PLL failure\n"); - err_exists = true; - } + + if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n"); if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) { /* Ignore this bit, don't prevent driver loading */ @@ -633,52 +758,32 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL; } - if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) { + if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) dev_err(hdev->dev, "Device boot error - binning failure\n"); - err_exists = true; - } if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); + if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) + dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n"); + + if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) + dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n"); + + /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n"); err_exists = true; } - /* All warnings should go here in order not to reach the unknown error validation */ - if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { - dev_warn(hdev->dev, - "Device boot warning - Skipped DRAM initialization\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED; - } + if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) + dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n"); - if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - Failed to load preboot primary image\n"); - /* This is a warning so we don't want it to disable the - * device as we have a secondary preboot image - */ - err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL; - } - - if (err_val & CPU_BOOT_ERR0_TPM_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - TPM failure\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_TPM_FAIL; - } + if (err_val & CPU_BOOT_ERR0_TPM_FAIL) + dev_warn(hdev->dev, "Device boot warning - TPM failure\n"); - if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) { - dev_err(hdev->dev, - "Device boot error - unknown ERR0 error 0x%08x\n", err_val); + if (err_val & CPU_BOOT_ERR_FATAL_MASK) err_exists = true; - } /* return error only if it's in the predefined mask */ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) & @@ -834,7 +939,7 @@ static int hl_fw_send_msi_info_msg(struct hl_device *hdev) { struct cpucp_array_data_packet *pkt; size_t total_pkt_size, data_size; - u64 result; + u64 result = 0; int rc; /* skip sending this info for unsupported ASICs */ @@ -925,11 +1030,10 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_EEPROM_TIMEOUT_USEC, &result); - if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP EEPROM packet, error %d\n", - rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP EEPROM packet, error %d\n", rc); goto out; } @@ -970,7 +1074,9 @@ int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc); goto out; } @@ -1004,8 +1110,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); return rc; } counters->rx_throughput = result; @@ -1019,8 +1126,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); return rc; } counters->tx_throughput = result; @@ -1033,8 +1141,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); return rc; } counters->replay_cnt = (u32) result; @@ -1054,9 +1163,9 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CpuCP total energy pkt, error %d\n", - rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CpuCP total energy pkt, error %d\n", rc); return rc; } @@ -1132,7 +1241,8 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc); return rc; } @@ -1159,7 +1269,8 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, "Failed to read power, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to read power, error %d\n", rc); return rc; } @@ -1196,8 +1307,9 @@ int hl_fw_dram_replaced_row_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc); goto out; } @@ -1222,7 +1334,8 @@ int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); if (rc) { - dev_err(hdev->dev, + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc); goto out; } @@ -1263,7 +1376,7 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) COMMS_RST_DEV, 0, false, hdev->fw_loader.cpu_timeout); if (rc) - dev_warn(hdev->dev, "Failed sending COMMS_RST_DEV\n"); + dev_err(hdev->dev, "Failed sending COMMS_RST_DEV\n"); } else { WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_RST_DEV); } @@ -1271,8 +1384,10 @@ void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev) void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) { - struct static_fw_load_mgr *static_loader = - &hdev->fw_loader.static_loader; + struct fw_load_mgr *fw_loader = &hdev->fw_loader; + u32 status, cpu_boot_status_reg, cpu_timeout; + struct static_fw_load_mgr *static_loader; + struct pre_fw_load_props *pre_fw_load; int rc; if (hdev->device_cpu_is_halted) @@ -1280,12 +1395,28 @@ void hl_fw_ask_halt_machine_without_linux(struct hl_device *hdev) /* Stop device CPU to make sure nothing bad happens */ if (hdev->asic_prop.dynamic_fw_load) { + pre_fw_load = &fw_loader->pre_fw_load; + cpu_timeout = fw_loader->cpu_timeout; + cpu_boot_status_reg = pre_fw_load->cpu_boot_status_reg; + rc = hl_fw_dynamic_send_protocol_cmd(hdev, &hdev->fw_loader, - COMMS_GOTO_WFE, 0, true, - hdev->fw_loader.cpu_timeout); - if (rc) - dev_warn(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + COMMS_GOTO_WFE, 0, false, cpu_timeout); + if (rc) { + dev_err(hdev->dev, "Failed sending COMMS_GOTO_WFE\n"); + } else { + rc = hl_poll_timeout( + hdev, + cpu_boot_status_reg, + status, + status == CPU_BOOT_STATUS_IN_WFE, + hdev->fw_poll_interval_usec, + cpu_timeout); + if (rc) + dev_err(hdev->dev, "Current status=%u. Timed-out updating to WFE\n", + status); + } } else { + static_loader = &hdev->fw_loader.static_loader; WREG32(static_loader->kmd_msg_to_cpu_reg, KMD_MSG_GOTO_WFE); msleep(static_loader->cpu_reset_wait_msec); @@ -1344,6 +1475,10 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status) dev_err(hdev->dev, "Device boot progress - Stuck in preboot after security initialization\n"); break; + case CPU_BOOT_STATUS_FW_SHUTDOWN_PREP: + dev_err(hdev->dev, + "Device boot progress - Stuck in preparation for shutdown\n"); + break; default: dev_err(hdev->dev, "Device boot progress - Invalid or unexpected status code %d\n", status); @@ -1354,8 +1489,9 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status) int hl_fw_wait_preboot_ready(struct hl_device *hdev) { struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load; - u32 status; - int rc; + u32 status = 0, timeout; + int rc, tries = 1, fw_err = 0; + bool preboot_still_runs; /* Need to check two possible scenarios: * @@ -1365,6 +1501,8 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev) * All other status values - for older firmwares where the uboot was * loaded from the FLASH */ + timeout = pre_fw_load->wait_for_preboot_timeout; +retry: rc = hl_poll_timeout( hdev, pre_fw_load->cpu_boot_status_reg, @@ -1373,20 +1511,37 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev) (status == CPU_BOOT_STATUS_READY_TO_BOOT) || (status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT), hdev->fw_poll_interval_usec, - pre_fw_load->wait_for_preboot_timeout); + timeout); + /* + * if F/W reports "security-ready" it means preboot might take longer. + * If the field 'wait_for_preboot_extended_timeout' is non 0 we wait again + * with that timeout + */ + preboot_still_runs = (status == CPU_BOOT_STATUS_SECURITY_READY || + status == CPU_BOOT_STATUS_IN_PREBOOT || + status == CPU_BOOT_STATUS_FW_SHUTDOWN_PREP || + status == CPU_BOOT_STATUS_DRAM_RDY); + + if (rc && tries && preboot_still_runs) { + tries--; + if (pre_fw_load->wait_for_preboot_extended_timeout) { + timeout = pre_fw_load->wait_for_preboot_extended_timeout; + goto retry; + } + } - if (rc) { + /* If we read all FF, then something is totally wrong, no point + * of reading specific errors + */ + if (status != -1) + fw_err = fw_read_errors(hdev, pre_fw_load->boot_err0_reg, + pre_fw_load->boot_err1_reg, + pre_fw_load->sts_boot_dev_sts0_reg, + pre_fw_load->sts_boot_dev_sts1_reg); + if (rc || fw_err) { detect_cpu_boot_status(hdev, status); - dev_err(hdev->dev, "CPU boot ready timeout (status = %d)\n", status); - - /* If we read all FF, then something is totally wrong, no point - * of reading specific errors - */ - if (status != -1) - fw_read_errors(hdev, pre_fw_load->boot_err0_reg, - pre_fw_load->boot_err1_reg, - pre_fw_load->sts_boot_dev_sts0_reg, - pre_fw_load->sts_boot_dev_sts1_reg); + dev_err(hdev->dev, "CPU boot %s (status = %d)\n", + fw_err ? "failed due to an error" : "ready timeout", status); return -EIO; } @@ -1657,7 +1812,7 @@ static void hl_fw_dynamic_send_cmd(struct hl_device *hdev, val = FIELD_PREP(COMMS_COMMAND_CMD_MASK, cmd); val |= FIELD_PREP(COMMS_COMMAND_SIZE_MASK, size); - trace_habanalabs_comms_send_cmd(hdev->dev, comms_cmd_str_arr[cmd]); + trace_habanalabs_comms_send_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]); WREG32(le32_to_cpu(dyn_regs->kmd_msg_to_cpu), val); } @@ -1715,7 +1870,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev, dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; - trace_habanalabs_comms_wait_status(hdev->dev, comms_sts_str_arr[expected_status]); + trace_habanalabs_comms_wait_status(&hdev->pdev->dev, comms_sts_str_arr[expected_status]); /* Wait for expected status */ rc = hl_poll_timeout( @@ -1732,7 +1887,8 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev, return -EIO; } - trace_habanalabs_comms_wait_status_done(hdev->dev, comms_sts_str_arr[expected_status]); + trace_habanalabs_comms_wait_status_done(&hdev->pdev->dev, + comms_sts_str_arr[expected_status]); /* * skip storing FW response for NOOP to preserve the actual desired @@ -1806,7 +1962,7 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, { int rc; - trace_habanalabs_comms_protocol_cmd(hdev->dev, comms_cmd_str_arr[cmd]); + trace_habanalabs_comms_protocol_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]); /* first send clear command to clean former commands */ rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); @@ -1945,7 +2101,7 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev, * note that no alignment/stride address issues here as all structures * are 64 bit padded. */ - data_ptr = (u8 *)fw_desc + sizeof(struct comms_desc_header); + data_ptr = (u8 *)fw_desc + sizeof(struct comms_msg_header); data_size = le16_to_cpu(fw_desc->header.size); data_crc32 = hl_fw_compat_crc32(data_ptr, data_size); @@ -2099,11 +2255,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); fw_data_size = le16_to_cpu(fw_desc->header.size); - temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size); + temp_fw_desc = vzalloc(sizeof(struct comms_msg_header) + fw_data_size); if (!temp_fw_desc) return -ENOMEM; - memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size); + memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_msg_header) + fw_data_size); rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader, (struct lkd_fw_comms_desc *) temp_fw_desc); @@ -2154,6 +2310,7 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, struct asic_fixed_properties *prop = &hdev->asic_prop; char *preboot_ver, *boot_ver; char btl_ver[32]; + int rc; switch (fwc) { case FW_COMP_BOOT_FIT: @@ -2167,22 +2324,22 @@ static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev, break; case FW_COMP_PREBOOT: strscpy(prop->preboot_ver, fw_version, VERSION_MAX_LEN); - preboot_ver = strnstr(prop->preboot_ver, "Preboot", - VERSION_MAX_LEN); + preboot_ver = strnstr(prop->preboot_ver, "Preboot", VERSION_MAX_LEN); + dev_info(hdev->dev, "preboot full version: '%s'\n", preboot_ver); + if (preboot_ver && preboot_ver != prop->preboot_ver) { strscpy(btl_ver, prop->preboot_ver, min((int) (preboot_ver - prop->preboot_ver), 31)); dev_info(hdev->dev, "%s\n", btl_ver); } + rc = hl_get_sw_major_minor_subminor(hdev, preboot_ver); + if (rc) + return rc; preboot_ver = extract_fw_ver_from_str(prop->preboot_ver); if (preboot_ver) { - int rc; - - dev_info(hdev->dev, "preboot version %s\n", preboot_ver); - - /* This function takes care of freeing preboot_ver */ - rc = extract_fw_sub_versions(hdev, preboot_ver); + rc = hl_get_preboot_major_minor(hdev, preboot_ver); + kfree(preboot_ver); if (rc) return rc; } @@ -2370,16 +2527,6 @@ static int hl_fw_dynamic_load_image(struct hl_device *hdev, if (rc) goto release_fw; - /* update state according to boot stage */ - if (cur_fwc == FW_COMP_BOOT_FIT) { - struct cpu_dyn_regs *dyn_regs; - - dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; - hl_fw_boot_fit_update_state(hdev, - le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), - le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); - } - /* copy boot fit to space allocated by FW */ rc = hl_fw_dynamic_copy_image(hdev, fw, fw_loader); if (rc) @@ -2634,17 +2781,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; } + rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms)); + if (rc) + goto protocol_err; + + if (hdev->asic_prop.support_dynamic_resereved_fw_size) + hdev->asic_prop.reserved_fw_mem_size = + le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M; + if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) { struct lkd_fw_binning_info *binning_info; - rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, 0); - if (rc) - goto protocol_err; - /* read preboot version */ rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT, fw_loader->dynamic_loader.comm_desc.cur_fw_ver); - if (rc) return rc; @@ -2682,6 +2832,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, goto protocol_err; } + rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader); + if (rc) + goto protocol_err; + + hl_fw_boot_fit_update_state(hdev, + le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + /* * when testing FW load (without Linux) on PLDM we don't want to * wait until boot fit is active as it may take several hours. @@ -2691,17 +2849,13 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, if (hdev->pldm && !(hdev->fw_components & FW_TYPE_LINUX)) return 0; - rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader); - if (rc) - goto protocol_err; - /* Enable DRAM scrambling before Linux boot and after successful * UBoot */ hdev->asic_funcs->init_cpu_scrambler_dram(hdev); if (!(hdev->fw_components & FW_TYPE_LINUX)) { - dev_info(hdev->dev, "Skip loading Linux F/W\n"); + dev_dbg(hdev->dev, "Skip loading Linux F/W\n"); return 0; } @@ -2728,7 +2882,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, if (rc) goto protocol_err; - hl_fw_linux_update_state(hdev, le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), + hl_fw_linux_update_state(hdev, + le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); hl_fw_dynamic_update_linux_interrupt_if(hdev); @@ -3030,10 +3185,10 @@ long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr) pkt.pll_index = cpu_to_le32((u32)used_pll_idx); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); - if (rc) { - dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n", - used_pll_idx, rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n", + used_pll_idx, rc); return rc; } @@ -3057,8 +3212,7 @@ void hl_fw_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq) pkt.value = cpu_to_le64(freq); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to set frequency to PLL %d, error %d\n", used_pll_idx, rc); } @@ -3074,9 +3228,9 @@ long hl_fw_get_max_power(struct hl_device *hdev) pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); - if (rc) { - dev_err(hdev->dev, "Failed to get max power, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to get max power, error %d\n", rc); return rc; } @@ -3098,8 +3252,7 @@ void hl_fw_set_max_power(struct hl_device *hdev) pkt.value = cpu_to_le64(hdev->max_power); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to set max power, error %d\n", rc); } @@ -3125,11 +3278,11 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void pkt.data_max_size = cpu_to_le32(size); pkt.nonce = cpu_to_le32(nonce); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - timeout, NULL); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), timeout, NULL); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc); goto out; } @@ -3149,10 +3302,18 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); } +int hl_fw_get_dev_info_signed(struct hl_device *hdev, + struct cpucp_dev_info_signed *dev_info_signed, u32 nonce) +{ + return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed, + sizeof(struct cpucp_dev_info_signed), nonce, + HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); +} + int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode, dma_addr_t buff, u32 *size) { - struct cpucp_packet pkt = {0}; + struct cpucp_packet pkt = {}; u64 result; int rc = 0; @@ -3163,10 +3324,12 @@ int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)&pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); - if (rc) - dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n"); - else + if (rc) { + if (rc != -EAGAIN) + dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n"); + } else { dev_dbg(hdev->dev, "generic pkt was successful, result: 0x%llx\n", result); + } *size = (u32)result; |
