From f64fa332602c311a76495fd0139bd4abb9aa7bbf Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Mon, 9 Oct 2023 15:07:38 +0300 Subject: accel/habanalabs: add pcie reset prepare/done hooks When working on a bare-metal system, if an FLR happens the firmware will handle it and the driver will have no knowledge of it. This causes two issues: 1. The driver will be in operational state while it should be in reset. This will cause the heartbeat mechanism to keep sending messages to FW while the PCI device is in reset. Eventually the heartbeat will fail and the device will end up in a non-operational state. 2. After FW handles the FLR, the reset sends it back to the preboot stage, and the driver needs to perform a hard reset in order to load the boot fit binary. This patch adds a reset_prepare hook that sets the device to the disabled state, so it is not operational, and a reset_done hook which is called after the actual FLR handling and performs a hard reset. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs_drv.c | 34 ++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 306a5bc9bf89..35ae0ff347f5 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -670,6 +670,38 @@ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) return PCI_ERS_RESULT_RECOVERED; } +static void hl_pci_reset_prepare(struct pci_dev *pdev) +{ + struct hl_device *hdev; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + hdev->disabled = true; +} + +static void hl_pci_reset_done(struct pci_dev *pdev) +{ + struct hl_device *hdev; + u32 flags; + + hdev = pci_get_drvdata(pdev); + if (!hdev) + return; + + /* + * Schedule a thread to trigger hard reset. 
+ * The reason for this handler, is for rare cases where the driver is up + * and FLR occurs. This is valid only when working with no VM, so FW handles FLR + * and resets the device. FW will go back preboot stage, so driver needs to perform + * hard reset in order to load FW fit again. + */ + flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW; + + hl_device_reset(hdev, flags); +} + static const struct dev_pm_ops hl_pm_ops = { .suspend = hl_pmops_suspend, .resume = hl_pmops_resume, @@ -679,6 +711,8 @@ static const struct pci_error_handlers hl_pci_err_handler = { .error_detected = hl_pci_err_detected, .slot_reset = hl_pci_err_slot_reset, .resume = hl_pci_err_resume, + .reset_prepare = hl_pci_reset_prepare, + .reset_done = hl_pci_reset_done, }; static struct pci_driver hl_pci_driver = { -- cgit From fbc2a09e09201eb64b94c0c7e4d2b4ec5337f844 Mon Sep 17 00:00:00 2001 From: Farah Kassabri Date: Wed, 18 Oct 2023 16:22:13 +0300 Subject: accel/habanalabs: update device boot error check Use a predefined mask which sets the device critical boot errors. The driver will fail and stop loading only upon detecting at least one of the errors defined in this mask. 
Signed-off-by: Farah Kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 115 +++++++------------------- 1 file changed, 32 insertions(+), 83 deletions(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 47e8384134aa..9e9dfe013659 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -646,39 +646,27 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) return rc; } -static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, - u32 sts_val) +static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val) { bool err_exists = false; if (!(err_val & CPU_BOOT_ERR0_ENABLED)) return false; - if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) { - dev_err(hdev->dev, - "Device boot error - DRAM initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) { + if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) { - dev_err(hdev->dev, - "Device boot error - Thermal Sensor initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) + dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n"); if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { if (hdev->bmc_enable) { - dev_err(hdev->dev, - "Device boot error - Skipped waiting for BMC\n"); - err_exists = true; + dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n"); } else { - dev_info(hdev->dev, - "Device boot message - Skipped waiting for BMC\n"); + dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n"); /* This is an info so we don't want it to disable the * device */ @@ 
-686,48 +674,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, } } - if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) { - dev_err(hdev->dev, - "Device boot error - Serdes data from BMC not available\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) + dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n"); - if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) { - dev_err(hdev->dev, - "Device boot error - NIC F/W initialization failed\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) + dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) { - dev_err(hdev->dev, - "Device boot warning - security not ready\n"); - err_exists = true; - } + if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) + dev_err(hdev->dev, "Device boot warning - security not ready\n"); - if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) { + if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) dev_err(hdev->dev, "Device boot error - security failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) { + if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) dev_err(hdev->dev, "Device boot error - eFuse failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) { + if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_PLL_FAIL) { + if (err_val & CPU_BOOT_ERR0_PLL_FAIL) dev_err(hdev->dev, "Device boot error - PLL failure\n"); - err_exists = true; - } - if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) { + if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n"); - err_exists = true; - } if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) { /* Ignore this bit, don't prevent driver loading */ @@ -735,52 +704,32 @@ static bool 
fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL; } - if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) { + if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) dev_err(hdev->dev, "Device boot error - binning failure\n"); - err_exists = true; - } if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); + if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) + dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n"); + + if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) + dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n"); + + /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n"); err_exists = true; } - /* All warnings should go here in order not to reach the unknown error validation */ - if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { - dev_warn(hdev->dev, - "Device boot warning - Skipped DRAM initialization\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED; - } + if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) + dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n"); - if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - Failed to load preboot primary image\n"); - /* This is a warning so we don't want it to disable the - * device as we have a secondary preboot image - */ - err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL; - } + if (err_val & CPU_BOOT_ERR0_TPM_FAIL) + dev_warn(hdev->dev, "Device boot warning - TPM failure\n"); - if (err_val & CPU_BOOT_ERR0_TPM_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - TPM failure\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_TPM_FAIL; - } - - if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) { 
- dev_err(hdev->dev, - "Device boot error - unknown ERR0 error 0x%08x\n", err_val); + if (err_val & CPU_BOOT_ERR_FATAL_MASK) err_exists = true; - } /* return error only if it's in the predefined mask */ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) & -- cgit From c6485482330d9f9a745d4bfec4dca722bd2bd75f Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 15 Oct 2023 14:04:53 +0300 Subject: accel/habanalabs/gaudi2: assume hard-reset by FW upon PCIe AXI drain When a PCIe AXI drain event happens, it is possible that the driver cannot access the device through PCIe, and therefore cannot send a hard-reset request to FW. Starting from FW version 1.13, FW will initiate a hard-reset in such a case without waiting for a reset request from the driver. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 8 ++++++++ drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 ++ 2 files changed, 10 insertions(+) diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 1655c101c705..5c69a482b8de 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3594,6 +3594,14 @@ static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major return false; } +static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major, + u32 fw_sw_minor) +{ + return (hdev->fw_sw_major_ver > fw_sw_major || + (hdev->fw_sw_major_ver == fw_sw_major && + hdev->fw_sw_minor_ver >= fw_sw_minor)); +} + /* * Kernel module functions that can be accessed by entire module */ diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 819660c684cf..b739078c2d87 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -10007,6 +10007,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct 
hl_eq_entry *eq_ent error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13)) + is_critical = true; break; case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN: -- cgit From e8bc0c1b1b730eb8759f5305ebd2d6876952e539 Mon Sep 17 00:00:00 2001 From: Farah Kassabri Date: Sun, 29 Oct 2023 16:16:16 +0200 Subject: accel/habanalabs: add log when eq event is not received Add an error log when no EQ event is received from FW, to cover a scenario in which FW is stuck for some reason. In such a case the driver will receive neither the EQ error interrupt nor the EQ heartbeat event, and will just initiate a reset without any indication in dmesg about the reason. Signed-off-by: Farah Kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 9711e8fc979d..d95a981b2906 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1049,10 +1049,12 @@ static void hl_device_eq_heartbeat(struct hl_device *hdev) if (!prop->cpucp_info.eq_health_check_supported) return; - if (hdev->eq_heartbeat_received) + if (hdev->eq_heartbeat_received) { hdev->eq_heartbeat_received = false; - else + } else { + dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask); + } } static void hl_device_heartbeat(struct work_struct *work) -- cgit From 42422993cf28d456778ee9168d73758ec037cd51 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 30 Oct 2023 12:23:57 +0200 Subject: accel/habanalabs: add support for Gaudi2C device A Gaudi2 with a PCI revision ID value of '3' represents a Gaudi2C device and should be detected and initialized as Gaudi2. 
Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 3 +++ drivers/accel/habanalabs/common/habanalabs.h | 2 ++ drivers/accel/habanalabs/common/habanalabs_drv.c | 3 +++ drivers/accel/habanalabs/common/mmu/mmu.c | 1 + drivers/accel/habanalabs/common/sysfs.c | 3 +++ drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h | 1 + 6 files changed, 13 insertions(+) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index d95a981b2906..d9447aeb3937 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -853,6 +853,9 @@ static int device_early_init(struct hl_device *hdev) gaudi2_set_asic_funcs(hdev); strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name)); break; + case ASIC_GAUDI2C: + gaudi2_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name)); break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 5c69a482b8de..7b0209e5bad6 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -1262,6 +1262,7 @@ struct hl_dec { * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000). * @ASIC_GAUDI2: Gaudi2 device. * @ASIC_GAUDI2B: Gaudi2B device. + * @ASIC_GAUDI2C: Gaudi2C device. 
*/ enum hl_asic_type { ASIC_INVALID, @@ -1270,6 +1271,7 @@ enum hl_asic_type { ASIC_GAUDI_SEC, ASIC_GAUDI2, ASIC_GAUDI2B, + ASIC_GAUDI2C, }; struct hl_cs_parser; diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 35ae0ff347f5..e542fd40e16c 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -141,6 +141,9 @@ static enum hl_asic_type get_asic_type(struct hl_device *hdev) case REV_ID_B: asic_type = ASIC_GAUDI2B; break; + case REV_ID_C: + asic_type = ASIC_GAUDI2C; + break; default: break; } diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index b2145716c605..b654302a68fc 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -596,6 +596,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) break; case ASIC_GAUDI2: case ASIC_GAUDI2B: + case ASIC_GAUDI2C: /* MMUs in Gaudi2 are always host resident */ hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 01f89f029355..278606373055 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -251,6 +251,9 @@ static ssize_t device_type_show(struct device *dev, case ASIC_GAUDI2B: str = "GAUDI2B"; break; + case ASIC_GAUDI2C: + str = "GAUDI2C"; + break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", hdev->asic_type); diff --git a/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h index f5d497dc9bdc..4f951cada077 100644 --- a/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h +++ b/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h @@ -25,6 +25,7 @@ enum hl_revision_id { REV_ID_INVALID = 0x00, REV_ID_A = 0x01, REV_ID_B = 0x02, + REV_ID_C = 0x03 }; #endif /* 
INCLUDE_PCI_GENERAL_H_ */ -- cgit From d1958dce5ab6a3e089c60cf474e8c9b7e96e70ad Mon Sep 17 00:00:00 2001 From: Farah Kassabri Date: Tue, 31 Oct 2023 12:20:36 +0200 Subject: accel/habanalabs: fix EQ heartbeat mechanism Stop rescheduling another heartbeat check when EQ heartbeat check fails as it generates confusing logs in dmesg that the heartbeat fails. Signed-off-by: Farah Kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index d9447aeb3937..6bf5f1d0d005 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1044,20 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev) return (vendor_id == PCI_VENDOR_ID_HABANALABS); } -static void hl_device_eq_heartbeat(struct hl_device *hdev) +static int hl_device_eq_heartbeat_check(struct hl_device *hdev) { - u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; struct asic_fixed_properties *prop = &hdev->asic_prop; if (!prop->cpucp_info.eq_health_check_supported) - return; + return 0; if (hdev->eq_heartbeat_received) { hdev->eq_heartbeat_received = false; } else { dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); - hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask); + return -EIO; } + + return 0; } static void hl_device_heartbeat(struct work_struct *work) @@ -1074,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work) /* * For EQ health check need to check if driver received the heartbeat eq event * in order to validate the eq is working. + * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule. 
*/ - hl_device_eq_heartbeat(hdev); - - if (!hdev->asic_funcs->send_heartbeat(hdev)) + if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev))) goto reschedule; if (hl_device_operational(hdev, NULL)) -- cgit From 0ec346779644039c4c05cfa7f071b1a24e54d8d9 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Tue, 31 Oct 2023 13:51:10 +0200 Subject: accel/habanalabs/gaudi2: fix undef opcode reporting Currently the undefined opcode event bit is set only for the lower CP and only if 'write_enable' is true. It should be set regardless, and for all streams, in order to report that event to userspace. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index b739078c2d87..5075f92d15cc 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7929,21 +7929,19 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type error_count++; } - if (i == QMAN_STREAMS && error_count) { - /* check for undefined opcode */ - if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK && - hdev->captured_err_info.undef_opcode.write_enable) { + /* check for undefined opcode */ + if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK) { + *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; + if (hdev->captured_err_info.undef_opcode.write_enable) { memset(&hdev->captured_err_info.undef_opcode, 0, sizeof(hdev->captured_err_info.undef_opcode)); - - hdev->captured_err_info.undef_opcode.write_enable = false; hdev->captured_err_info.undef_opcode.timestamp = ktime_get(); hdev->captured_err_info.undef_opcode.engine_id = gaudi2_queue_id_to_engine_id[qid_base]; - *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; } - handle_lower_qman_data_on_err(hdev, qman_base, 
*event_mask); + if (i == QMAN_STREAMS) + handle_lower_qman_data_on_err(hdev, qman_base, *event_mask); } } -- cgit From 571cdb6e3b9a9c077b39047091f0ccc721b92b83 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 9 Nov 2023 10:53:10 +0200 Subject: accel/habanalabs: remove 'get temperature' debug print The print was added long back for a specific debug and can now be removed. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/hwmon.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c index 8598056216e7..1ee2ee07e9ed 100644 --- a/drivers/accel/habanalabs/common/hwmon.c +++ b/drivers/accel/habanalabs/common/hwmon.c @@ -578,10 +578,6 @@ int hl_get_temperature(struct hl_device *hdev, CPUCP_PKT_CTL_OPCODE_SHIFT); pkt.sensor_index = __cpu_to_le16(sensor_index); pkt.type = __cpu_to_le16(attr); - - dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n", - pkt.ctl, pkt.sensor_index, pkt.type); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); -- cgit From 4b0b1fbc7757169b6d304545a321c7a88f13f8f0 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 20 Jul 2023 16:50:39 +0300 Subject: accel/habanalabs: set hard reset flag if graceful reset is skipped hl_device_cond_reset() might be called with the hard reset flag unset, because a compute reset upon device release as part of a graceful reset is valid. If the conditions for graceful reset are not met, hl_device_reset() will be called for an immediate reset. In this case a compute reset is not valid, so it will be replaced with a hard reset together with a debug message about it. This message might be confusing, as it implies that a compute reset was requested when it shouldn't. To prevent this confusion, set the hard reset flag in hl_device_cond_reset() if going to an immediate reset. 
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 6bf5f1d0d005..a365791a9f5c 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -2040,7 +2040,7 @@ device_reset: if (ctx) hl_ctx_put(ctx); - return hl_device_reset(hdev, flags); + return hl_device_reset(hdev, flags | HL_DRV_RESET_HARD); } static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask) -- cgit From ae303d885d4a0fcea65330de9327d28edfebd206 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 6 Nov 2023 18:41:35 +0200 Subject: accel/habanalabs/gaudi2: get the correct QM CQ info upon an error Upon a QM error, the address/size from both the CQ and the ARC_CQ are printed, although the instruction that led to the error was received from only one of them. Moreover, in case of a QM undefined opcode, only one of these address/size sets will be captured based on the value of ARC_CQ_PTR. However, this value can be non-zero even if currently the CQ is used, in case the CQ/ARC_CQ are alternately used. Under the assumption of having a stop-on-error configuration, modify to use CP_STS.CUR_CQ field to get the relevant CQ for the QM error. 
Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 44 +++++++++++----------- .../include/gaudi2/asic_reg/gaudi2_regs.h | 1 + 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 5075f92d15cc..77c480725a84 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask) { - u32 lo, hi, cq_ptr_size, arc_cq_ptr_size; - u64 cq_ptr, arc_cq_ptr, cp_current_inst; - - lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET); - hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET); - cq_ptr = ((u64) hi) << 32 | lo; - cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET); - - lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET); - hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET); - arc_cq_ptr = ((u64) hi) << 32 | lo; - arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET); + u32 lo, hi, cq_ptr_size, cp_sts; + u64 cq_ptr, cp_current_inst; + bool is_arc_cq; + + cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET); + is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */ + + if (is_arc_cq) { + lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET); + hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET); + cq_ptr = ((u64) hi) << 32 | lo; + cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET); + } else { + lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET); + hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET); + cq_ptr = ((u64) hi) << 32 | lo; + cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET); + } lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET); hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET); cp_current_inst = ((u64) hi) << 32 | lo; 
dev_info(hdev->dev, - "LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", - cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst); + "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", + is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst); if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { - if (arc_cq_ptr) { - hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr; - hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size; - } else { - hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr; - hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size; - } - + hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr; + hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size; hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS; } } diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index a08378d0802b..8018214a7b59 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -250,6 +250,7 @@ #define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE) #define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE) +#define QM_CP_STS_4_OFFSET (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE) -- cgit From c9f9d0e3d0db300315d3bde7f122439633e6f007 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 12 Nov 2023 13:38:42 +0200 Subject: accel/habanalabs: print error code when mapping fails Failure to map is considered a non-trivial error and we need to notify the user about it. 
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index 0b8689fe0b64..3348ad12c237 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -955,8 +955,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr, (i + 1) == phys_pg_pack->npages); if (rc) { dev_err(hdev->dev, - "map failed for handle %u, npages: %llu, mapped: %llu", - phys_pg_pack->handle, phys_pg_pack->npages, + "map failed (%d) for handle %u, npages: %llu, mapped: %llu\n", + rc, phys_pg_pack->handle, phys_pg_pack->npages, mapped_pg_cnt); goto err; } @@ -1186,7 +1186,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack); if (rc) { - dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle); + dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n", + rc, handle); mutex_unlock(&hdev->mmu_lock); goto map_err; } -- cgit From 47a552863d6c9ea26abe9ad35d2c35e4d6896551 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 13 Nov 2023 21:11:13 +0200 Subject: accel/habanalabs: expose module id through sysfs Module ID exposes the physical location of the device in the server, from the pov of the devices in regard to how they are connected by internal fabric. This information is already exposed in our INFO ioctl, but there are utilities and scripts running in data-center which are already accessing sysfs for topology information and it is easier for them to continue getting that information from sysfs instead of opening a file descriptor. 
Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- Documentation/ABI/testing/sysfs-driver-habanalabs | 6 ++++++ drivers/accel/habanalabs/common/sysfs.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs index c63ca1ad500d..89fe3b09d4ad 100644 --- a/Documentation/ABI/testing/sysfs-driver-habanalabs +++ b/Documentation/ABI/testing/sysfs-driver-habanalabs @@ -149,6 +149,12 @@ Contact: ogabbay@kernel.org Description: Displays the current clock frequency, in Hz, of the MME compute engine. This property is valid only for the Goya ASIC family +What: /sys/class/accel/accel/device/module_id +Date: Nov 2023 +KernelVersion: not yet upstreamed +Contact: ogabbay@kernel.org +Description: Displays the device's module id + What: /sys/class/accel/accel/device/pci_addr Date: Jan 2019 KernelVersion: 5.1 diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 278606373055..8d2164691d81 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -386,6 +386,14 @@ static ssize_t security_enabled_show(struct device *dev, return sprintf(buf, "%d\n", hdev->asic_prop.fw_security_enabled); } +static ssize_t module_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location)); +} + static DEVICE_ATTR_RO(armcp_kernel_ver); static DEVICE_ATTR_RO(armcp_ver); static DEVICE_ATTR_RO(cpld_ver); @@ -405,6 +413,7 @@ static DEVICE_ATTR_RO(thermal_ver); static DEVICE_ATTR_RO(uboot_ver); static DEVICE_ATTR_RO(fw_os_ver); static DEVICE_ATTR_RO(security_enabled); +static DEVICE_ATTR_RO(module_id); static struct bin_attribute bin_attr_eeprom = { .attr = {.name = "eeprom", .mode = (0444)}, @@ -430,6 +439,7 @@ static struct 
attribute *hl_dev_attrs[] = { &dev_attr_uboot_ver.attr, &dev_attr_fw_os_ver.attr, &dev_attr_security_enabled.attr, + &dev_attr_module_id.attr, NULL, }; -- cgit From 5bc155cfea605cd64aa372b44a67473b49c4726c Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 17 Nov 2023 12:49:19 +0200 Subject: accel/habanalabs/gaudi2: use correct registers to dump QM CQ info The QM CQ PTR_LO/PTR_HI/TSIZE registers are for pushing a CQ entry, and although they are updated by HW even when descriptors are fetched by PQ and CB addresses are fed into CQ, the correct registers to use when dumping the CQ info are the ones with the _STS suffix. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 12 ++++++------ .../accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 77c480725a84..bf537c2082cd 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7868,15 +7868,15 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */ if (is_arc_cq) { - lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET); - hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET); + lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_STS_OFFSET); + hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_STS_OFFSET); cq_ptr = ((u64) hi) << 32 | lo; - cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET); + cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET); } else { - lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET); - hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET); + lo = RREG32(qman_base + QM_CQ_PTR_LO_STS_4_OFFSET); + hi = RREG32(qman_base + QM_CQ_PTR_HI_STS_4_OFFSET); cq_ptr = ((u64) hi) << 32 | lo; - cq_ptr_size = RREG32(qman_base + 
QM_CQ_TSIZE_4_OFFSET); + cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET); } lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET); diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index 8018214a7b59..d21fcd3880b4 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -242,13 +242,13 @@ #define QM_FENCE2_OFFSET (mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE) #define QM_SEI_STATUS_OFFSET (mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE) -#define QM_CQ_PTR_LO_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE) -#define QM_CQ_PTR_HI_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE) -#define QM_CQ_TSIZE_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE) +#define QM_CQ_TSIZE_STS_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_STS_4 - mmPDMA0_QM_BASE) +#define QM_CQ_PTR_LO_STS_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_STS_4 - mmPDMA0_QM_BASE) +#define QM_CQ_PTR_HI_STS_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_STS_4 - mmPDMA0_QM_BASE) -#define QM_ARC_CQ_PTR_LO_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE) -#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE) -#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_TSIZE_STS_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE_STS - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_PTR_LO_STS_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO_STS - mmPDMA0_QM_BASE) +#define QM_ARC_CQ_PTR_HI_STS_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI_STS - mmPDMA0_QM_BASE) #define QM_CP_STS_4_OFFSET (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE) -- cgit From 7259eb7b534735b9c1153654c0bb4c5f059c0dd3 Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Sun, 12 Nov 2023 18:07:10 +0200 Subject: accel/habanalabs/gaudi2: add signed dev info uAPI User will provide a nonce via the INFO ioctl, and will 
retrieve the signed device info generated using given nonce. Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/firmware_if.c | 8 ++++ drivers/accel/habanalabs/common/habanalabs.h | 2 + drivers/accel/habanalabs/common/habanalabs_ioctl.c | 53 ++++++++++++++++++++++ include/linux/habanalabs/cpucp_if.h | 8 +++- include/uapi/drm/habanalabs_accel.h | 28 ++++++++++++ 5 files changed, 98 insertions(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 9e9dfe013659..3558a6a8e192 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -3244,6 +3244,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); } +int hl_fw_get_dev_info_signed(struct hl_device *hdev, + struct cpucp_dev_info_signed *dev_info_signed, u32 nonce) +{ + return hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed, + sizeof(struct cpucp_dev_info_signed), nonce, + HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC); +} + int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode, dma_addr_t buff, u32 *size) { diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 7b0209e5bad6..dd3fe3ddc00a 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3964,6 +3964,8 @@ long hl_fw_get_max_power(struct hl_device *hdev); void hl_fw_set_max_power(struct hl_device *hdev); int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info, u32 nonce); +int hl_fw_get_dev_info_signed(struct hl_device *hdev, + struct cpucp_dev_info_signed *dev_info_signed, u32 nonce); int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value); int 
hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value); int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value); diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 8ef36effb95b..a92713e0e580 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -19,6 +19,9 @@ #include +/* make sure there is space for all the signed info */ +static_assert(sizeof(struct cpucp_info) <= SEC_DEV_INFO_BUF_SZ); + static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = { [HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr), [HL_DEBUG_OP_ETF] = sizeof(struct hl_debug_params_etf), @@ -719,6 +722,53 @@ free_sec_attest_info: return rc; } +static int dev_info_signed(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct cpucp_dev_info_signed *dev_info_signed; + struct hl_info_signed *info; + u32 max_size = args->return_size; + int rc; + + if ((!max_size) || (!out)) + return -EINVAL; + + dev_info_signed = kzalloc(sizeof(*dev_info_signed), GFP_KERNEL); + if (!dev_info_signed) + return -ENOMEM; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + rc = -ENOMEM; + goto free_dev_info_signed; + } + + rc = hl_fw_get_dev_info_signed(hpriv->hdev, + dev_info_signed, args->sec_attest_nonce); + if (rc) + goto free_info; + + info->nonce = le32_to_cpu(dev_info_signed->nonce); + info->info_sig_len = dev_info_signed->info_sig_len; + info->pub_data_len = le16_to_cpu(dev_info_signed->pub_data_len); + info->certificate_len = le16_to_cpu(dev_info_signed->certificate_len); + info->dev_info_len = sizeof(struct cpucp_info); + memcpy(&info->info_sig, &dev_info_signed->info_sig, sizeof(info->info_sig)); + memcpy(&info->public_data, &dev_info_signed->public_data, sizeof(info->public_data)); + memcpy(&info->certificate, 
&dev_info_signed->certificate, sizeof(info->certificate)); + memcpy(&info->dev_info, &dev_info_signed->info, info->dev_info_len); + + rc = copy_to_user(out, info, min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0; + +free_info: + kfree(info); +free_dev_info_signed: + kfree(dev_info_signed); + + return rc; +} + + static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args) { int rc; @@ -1089,6 +1139,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_FW_GENERIC_REQ: return send_fw_generic_request(hdev, args); + case HL_INFO_DEV_SIGNED: + return dev_info_signed(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -EINVAL; diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 86ea7c63a0d2..f316c8d0f3fc 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -659,6 +659,12 @@ enum pq_init_status { * number (nonce) provided by the host to prevent replay attacks. * public key and certificate also provided as part of the FW response. * + * CPUCP_PACKET_INFO_SIGNED_GET - + * Get the device information signed by the Trusted Platform device. + * device info data is also hashed with some unique number (nonce) provided + * by the host to prevent replay attacks. public key and certificate also + * provided as part of the FW response. + * * CPUCP_PACKET_MONITOR_DUMP_GET - * Get monitors registers dump from the CpuCP kernel. 
* The CPU will put the registers dump in the a buffer allocated by the driver @@ -733,7 +739,7 @@ enum cpucp_packet_id { CPUCP_PACKET_ENGINE_CORE_ASID_SET, /* internal */ CPUCP_PACKET_RESERVED2, /* not used */ CPUCP_PACKET_SEC_ATTEST_GET, /* internal */ - CPUCP_PACKET_RESERVED3, /* not used */ + CPUCP_PACKET_INFO_SIGNED_GET, /* internal */ CPUCP_PACKET_RESERVED4, /* not used */ CPUCP_PACKET_MONITOR_DUMP_GET, /* debugfs */ CPUCP_PACKET_RESERVED5, /* not used */ diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 347c7b62e60e..a512dc4cffd0 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -846,6 +846,7 @@ enum hl_server_type { #define HL_INFO_HW_ERR_EVENT 36 #define HL_INFO_FW_ERR_EVENT 37 #define HL_INFO_USER_ENGINE_ERR_EVENT 38 +#define HL_INFO_DEV_SIGNED 40 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -1256,6 +1257,7 @@ struct hl_info_dev_memalloc_page_sizes { #define SEC_SIGNATURE_BUF_SZ 255 /* (256 - 1) 1 byte used for size */ #define SEC_PUB_DATA_BUF_SZ 510 /* (512 - 2) 2 bytes used for size */ #define SEC_CERTIFICATE_BUF_SZ 2046 /* (2048 - 2) 2 bytes used for size */ +#define SEC_DEV_INFO_BUF_SZ 5120 /* * struct hl_info_sec_attest - attestation report of the boot @@ -1290,6 +1292,32 @@ struct hl_info_sec_attest { __u8 pad0[2]; }; +/* + * struct hl_info_signed - device information signed by a secured device. + * @nonce: number only used once. random number provided by host. this also passed to the quote + * command as a qualifying data. + * @pub_data_len: length of the public data (bytes) + * @certificate_len: length of the certificate (bytes) + * @info_sig_len: length of the attestation signature (bytes) + * @public_data: public key info signed info data (outPublic + name + qualifiedName) + * @certificate: certificate for the signing key + * @info_sig: signature of the info + nonce data. 
+ * @dev_info_len: length of device info (bytes) + * @dev_info: device info as byte array. + */ +struct hl_info_signed { + __u32 nonce; + __u16 pub_data_len; + __u16 certificate_len; + __u8 info_sig_len; + __u8 public_data[SEC_PUB_DATA_BUF_SZ]; + __u8 certificate[SEC_CERTIFICATE_BUF_SZ]; + __u8 info_sig[SEC_SIGNATURE_BUF_SZ]; + __u16 dev_info_len; + __u8 dev_info[SEC_DEV_INFO_BUF_SZ]; + __u8 pad[2]; +}; + /** * struct hl_page_fault_info - page fault information. * @timestamp: timestamp of page fault. -- cgit From d980e1ced9899451d2d2a84030ce9f2a04eae7e8 Mon Sep 17 00:00:00 2001 From: Ariel Suller Date: Thu, 23 Nov 2023 16:27:07 +0200 Subject: accel/habanalabs: report 3 instances of Infineon second stage Infineon controller second stage has 3 instances that their version need to be reported by driver. Signed-off-by: Ariel Suller Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/sysfs.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 8d2164691d81..c940c5f1d109 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -8,6 +8,7 @@ #include "habanalabs.h" #include +#include static ssize_t clk_max_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -80,12 +81,27 @@ static ssize_t vrm_ver_show(struct device *dev, struct device_attribute *attr, c { struct hl_device *hdev = dev_get_drvdata(dev); struct cpucp_info *cpucp_info; + u32 infineon_second_stage_version; + u32 infineon_second_stage_first_instance; + u32 infineon_second_stage_second_instance; + u32 infineon_second_stage_third_instance; + u32 mask = 0xff; cpucp_info = &hdev->asic_prop.cpucp_info; + infineon_second_stage_version = le32_to_cpu(cpucp_info->infineon_second_stage_version); + infineon_second_stage_first_instance = infineon_second_stage_version & mask; + 
infineon_second_stage_second_instance = + (infineon_second_stage_version >> 8) & mask; + infineon_second_stage_third_instance = + (infineon_second_stage_version >> 16) & mask; + if (cpucp_info->infineon_second_stage_version) - return sprintf(buf, "%#04x %#04x\n", le32_to_cpu(cpucp_info->infineon_version), - le32_to_cpu(cpucp_info->infineon_second_stage_version)); + return sprintf(buf, "%#04x %#04x:%#04x:%#04x\n", + le32_to_cpu(cpucp_info->infineon_version), + infineon_second_stage_first_instance, + infineon_second_stage_second_instance, + infineon_second_stage_third_instance); else return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version)); } -- cgit From 565ee78840906da33f124c143e0c788e42f824e8 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 23 Nov 2023 09:49:36 +0200 Subject: accel/habanalabs/gaudi2: add zero padding when printing QM CP instruction QM instructions are in multiples of 64 bits and the command type is in the upper bits of first QWORD. To make it clearer that an undefined command is due to a type of 0x0, always print all 64 bits and add a zero padding if needed. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index bf537c2082cd..f81b57649b00 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7884,7 +7884,7 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, cp_current_inst = ((u64) hi) << 32 | lo; dev_info(hdev->dev, - "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", + "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#018llx}\n", is_arc_cq ? 
"ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst); if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { -- cgit From cf0719a8a3e72cb82d83f79aa57ae11d86324915 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 27 Sep 2023 18:50:30 +0300 Subject: accel/habanalabs: update debugfs-driver-habanalabs with the device-name directory The device debugfs directory was modified to be named as the parent device name. Update the paths accordingly. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../ABI/testing/debugfs-driver-habanalabs | 72 +++++++++++----------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs index 042fd125fbc9..a7a432dc4015 100644 --- a/Documentation/ABI/testing/debugfs-driver-habanalabs +++ b/Documentation/ABI/testing/debugfs-driver-habanalabs @@ -1,4 +1,4 @@ -What: /sys/kernel/debug/accel/<n>/addr +What: /sys/kernel/debug/accel/<parent_device>/addr Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -8,34 +8,34 @@ Description: Sets the device address to be used for read or write through only when the IOMMU is disabled.
The acceptable value is a string that starts with "0x" -What: /sys/kernel/debug/accel/<n>/clk_gate +What: /sys/kernel/debug/accel/<parent_device>/clk_gate Date: May 2020 KernelVersion: 5.8 Contact: ogabbay@kernel.org Description: This setting is now deprecated as clock gating is handled solely by the f/w -What: /sys/kernel/debug/accel/<n>/command_buffers +What: /sys/kernel/debug/accel/<parent_device>/command_buffers Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Displays a list with information about the currently allocated command buffers -What: /sys/kernel/debug/accel/<n>/command_submission +What: /sys/kernel/debug/accel/<parent_device>/command_submission Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Displays a list with information about the currently active command submissions -What: /sys/kernel/debug/accel/<n>/command_submission_jobs +What: /sys/kernel/debug/accel/<parent_device>/command_submission_jobs Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Displays a list with detailed information about each JOB (CB) of each active command submission -What: /sys/kernel/debug/accel/<n>/data32 +What: /sys/kernel/debug/accel/<parent_device>/data32 Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -50,7 +50,7 @@ Description: Allows the root user to read or write directly through the If the IOMMU is disabled, it also allows the root user to read or write from the host a device VA of a host mapped memory -What: /sys/kernel/debug/accel/<n>/data64 +What: /sys/kernel/debug/accel/<parent_device>/data64 Date: Jan 2020 KernelVersion: 5.6 Contact: ogabbay@kernel.org @@ -65,7 +65,7 @@ Description: Allows the root user to read or write 64 bit data directly If the IOMMU is disabled, it also allows the root user to read or write from the host a device VA of a host mapped memory -What: /sys/kernel/debug/accel/<n>/data_dma +What: /sys/kernel/debug/accel/<parent_device>/data_dma Date: Apr 2021 KernelVersion: 5.13 Contact: ogabbay@kernel.org @@ -83,7 +83,7 @@ Description: Allows the root user to read from the
device's internal workloads. Only supported on GAUDI at this stage. -What: /sys/kernel/debug/accel/<n>/device +What: /sys/kernel/debug/accel/<parent_device>/device Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -91,14 +91,14 @@ Description: Enables the root user to set the device to specific state. Valid values are "disable", "enable", "suspend", "resume". User can read this property to see the valid values -What: /sys/kernel/debug/accel/<n>/device_release_watchdog_timeout +What: /sys/kernel/debug/accel/<parent_device>/device_release_watchdog_timeout Date: Oct 2022 KernelVersion: 6.2 Contact: ttayar@habana.ai Description: The watchdog timeout value in seconds for a device release upon certain error cases, after which the device is reset. -What: /sys/kernel/debug/accel/<n>/dma_size +What: /sys/kernel/debug/accel/<parent_device>/dma_size Date: Apr 2021 KernelVersion: 5.13 Contact: ogabbay@kernel.org @@ -108,7 +108,7 @@ Description: Specify the size of the DMA transaction when using DMA to read When the write is finished, the user can read the "data_dma" blob -What: /sys/kernel/debug/accel/<n>/dump_razwi_events +What: /sys/kernel/debug/accel/<parent_device>/dump_razwi_events Date: Aug 2022 KernelVersion: 5.20 Contact: fkassabri@habana.ai @@ -117,7 +117,7 @@ Description: Dumps all razwi events to dmesg if exist. the routine will clear the status register. Usage: cat dump_razwi_events -What: /sys/kernel/debug/accel/<n>/dump_security_violations +What: /sys/kernel/debug/accel/<parent_device>/dump_security_violations Date: Jan 2021 KernelVersion: 5.12 Contact: ogabbay@kernel.org @@ -125,14 +125,14 @@ Description: Dumps all security violations to dmesg.
This will also ack all security violations meanings those violations will not be dumped next time user calls this API -What: /sys/kernel/debug/accel/<n>/engines +What: /sys/kernel/debug/accel/<parent_device>/engines Date: Jul 2019 KernelVersion: 5.3 Contact: ogabbay@kernel.org Description: Displays the status registers values of the device engines and their derived idle status -What: /sys/kernel/debug/accel/<n>/i2c_addr +What: /sys/kernel/debug/accel/<parent_device>/i2c_addr Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -140,7 +140,7 @@ Description: Sets I2C device address for I2C transaction that is generated by the device's CPU, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/i2c_bus +What: /sys/kernel/debug/accel/<parent_device>/i2c_bus Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -148,7 +148,7 @@ Description: Sets I2C bus address for I2C transaction that is generated by the device's CPU, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/i2c_data +What: /sys/kernel/debug/accel/<parent_device>/i2c_data Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -157,7 +157,7 @@ Description: Triggers an I2C transaction that is generated by the device's reading from the file generates a read transaction, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/i2c_len +What: /sys/kernel/debug/accel/<parent_device>/i2c_len Date: Dec 2021 KernelVersion: 5.17 Contact: obitton@habana.ai @@ -165,7 +165,7 @@ Description: Sets I2C length in bytes for I2C transaction that is generated b the device's CPU, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/i2c_reg +What: /sys/kernel/debug/accel/<parent_device>/i2c_reg Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -173,35 +173,35 @@ Description: Sets I2C register id for I2C transaction that is generated by the device's CPU, Not available when device is loaded with secured firmware -What:
/sys/kernel/debug/accel/<n>/led0 +What: /sys/kernel/debug/accel/<parent_device>/led0 Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Sets the state of the first S/W led on the device, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/led1 +What: /sys/kernel/debug/accel/<parent_device>/led1 Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Sets the state of the second S/W led on the device, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/led2 +What: /sys/kernel/debug/accel/<parent_device>/led2 Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Sets the state of the third S/W led on the device, Not available when device is loaded with secured firmware -What: /sys/kernel/debug/accel/<n>/memory_scrub +What: /sys/kernel/debug/accel/<parent_device>/memory_scrub Date: May 2022 KernelVersion: 5.19 Contact: dhirschfeld@habana.ai Description: Allows the root user to scrub the dram memory. The scrubbing value can be set using the debugfs file memory_scrub_val. -What: /sys/kernel/debug/accel/<n>/memory_scrub_val +What: /sys/kernel/debug/accel/<parent_device>/memory_scrub_val Date: May 2022 KernelVersion: 5.19 Contact: dhirschfeld@habana.ai @@ -209,7 +209,7 @@ Description: The value to which the dram will be set to when the user scrubs the dram using 'memory_scrub' debugfs file and the scrubbing value when using module param 'memory_scrub' -What: /sys/kernel/debug/accel/<n>/mmu +What: /sys/kernel/debug/accel/<parent_device>/mmu Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -219,7 +219,7 @@ Description: Displays the hop values and physical address for a given ASID e.g.
to display info about VA 0x1000 for ASID 1 you need to do: echo "1 0x1000" > /sys/kernel/debug/accel/0/mmu -What: /sys/kernel/debug/accel/<n>/mmu_error +What: /sys/kernel/debug/accel/<parent_device>/mmu_error Date: Mar 2021 KernelVersion: 5.12 Contact: fkassabri@habana.ai @@ -229,7 +229,7 @@ Description: Check and display page fault or access violation mmu errors for echo "0x200" > /sys/kernel/debug/accel/0/mmu_error cat /sys/kernel/debug/accel/0/mmu_error -What: /sys/kernel/debug/accel/<n>/monitor_dump +What: /sys/kernel/debug/accel/<parent_device>/monitor_dump Date: Mar 2022 KernelVersion: 5.19 Contact: osharabi@habana.ai @@ -243,7 +243,7 @@ Description: Allows the root user to dump monitors status from the device's This interface doesn't support concurrency in the same device. Only supported on GAUDI. -What: /sys/kernel/debug/accel/<n>/monitor_dump_trig +What: /sys/kernel/debug/accel/<parent_device>/monitor_dump_trig Date: Mar 2022 KernelVersion: 5.19 Contact: osharabi@habana.ai @@ -253,14 +253,14 @@ Description: Triggers dump of monitor data. The value to trigger the operatio When the write is finished, the user can read the "monitor_dump" blob -What: /sys/kernel/debug/accel/<n>/set_power_state +What: /sys/kernel/debug/accel/<parent_device>/set_power_state Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org Description: Sets the PCI power state. Valid values are "1" for D0 and "2" for D3Hot -What: /sys/kernel/debug/accel/<n>/skip_reset_on_timeout +What: /sys/kernel/debug/accel/<parent_device>/skip_reset_on_timeout Date: Jun 2021 KernelVersion: 5.13 Contact: ynudelman@habana.ai @@ -268,7 +268,7 @@ Description: Sets the skip reset on timeout option for the device. Value of "0" means device will be reset in case some CS has timed out, otherwise it will not be reset. -What: /sys/kernel/debug/accel/<n>/state_dump +What: /sys/kernel/debug/accel/<parent_device>/state_dump Date: Oct 2021 KernelVersion: 5.15 Contact: ynudelman@habana.ai @@ -279,7 +279,7 @@ Description: Gets the state dump occurring on a CS timeout or failure.
Writing an integer X discards X state dumps, so that the next read would return X+1-st newest state dump. -What: /sys/kernel/debug/accel/<n>/stop_on_err +What: /sys/kernel/debug/accel/<parent_device>/stop_on_err Date: Mar 2020 KernelVersion: 5.6 Contact: ogabbay@kernel.org @@ -287,13 +287,13 @@ Description: Sets the stop-on_error option for the device engines. Value of "0" is for disable, otherwise enable. Relevant only for GOYA and GAUDI. -What: /sys/kernel/debug/accel/<n>/timeout_locked +What: /sys/kernel/debug/accel/<parent_device>/timeout_locked Date: Sep 2021 KernelVersion: 5.16 Contact: obitton@habana.ai Description: Sets the command submission timeout value in seconds. -What: /sys/kernel/debug/accel/<n>/userptr +What: /sys/kernel/debug/accel/<parent_device>/userptr Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org @@ -301,7 +301,7 @@ Description: Displays a list with information about the current user pointers (user virtual addresses) that are pinned and mapped to DMA addresses -What: /sys/kernel/debug/accel/<n>/userptr_lookup +What: /sys/kernel/debug/accel/<parent_device>/userptr_lookup Date: Oct 2021 KernelVersion: 5.15 Contact: ogabbay@kernel.org @@ -309,7 +309,7 @@ Description: Allows to search for specific user pointers (user virtual addresses) that are pinned and mapped to DMA addresses, and see their resolution to the specific dma address. -What: /sys/kernel/debug/accel/<n>/vm +What: /sys/kernel/debug/accel/<parent_device>/vm Date: Jan 2019 KernelVersion: 5.1 Contact: ogabbay@kernel.org -- cgit From aa5cea38ce687021bf97f9f4cdb18b26db290964 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 3 May 2023 13:19:17 +0300 Subject: accel/habanalabs: add parent_device sysfs attribute The device debugfs directory was modified to be named as the device-name. This name is the parent device name, i.e. either the PCI address in case of an ASIC, or the simulator device name in case of a simulator.
This change makes it more difficult for a user to access the debugfs directory for a specific accel device, because he can't just use the accel minor id, but he needs to do more device-dependent operations to get the device name. To make it easier to get this name, add a 'parent_device' sysfs attribute that the user can read using the minor id before accessing debugfs. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- Documentation/ABI/testing/sysfs-driver-habanalabs | 6 ++++++ drivers/accel/habanalabs/common/habanalabs.h | 3 +++ drivers/accel/habanalabs/common/sysfs.c | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs index 89fe3b09d4ad..4244f5af4b54 100644 --- a/Documentation/ABI/testing/sysfs-driver-habanalabs +++ b/Documentation/ABI/testing/sysfs-driver-habanalabs @@ -155,6 +155,12 @@ KernelVersion: not yet upstreamed Contact: ogabbay@kernel.org Description: Displays the device's module id +What: /sys/class/accel/accel<n>/device/parent_device +Date: Nov 2023 +KernelVersion: 6.8 +Contact: ttayar@habana.ai +Description: Displays the name of the parent device of the accel device + What: /sys/class/accel/accel<n>/device/pci_addr Date: Jan 2019 KernelVersion: 5.1 diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index dd3fe3ddc00a..2a900c9941fe 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3521,6 +3521,9 @@ struct hl_device { u8 heartbeat; }; +/* Retrieve PCI device name in case of a PCI device or dev name in simulator */ +#define HL_DEV_NAME(hdev) \ + ((hdev)->pdev ?
dev_name(&(hdev)->pdev->dev) : "NA-DEVICE") /** * struct hl_cs_encaps_sig_handle - encapsulated signals handle structure diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index c940c5f1d109..8a9f98832157 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -410,6 +410,13 @@ static ssize_t module_id_show(struct device *dev, return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location)); } +static ssize_t parent_device_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hl_device *hdev = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", HL_DEV_NAME(hdev)); +} + static DEVICE_ATTR_RO(armcp_kernel_ver); static DEVICE_ATTR_RO(armcp_ver); static DEVICE_ATTR_RO(cpld_ver); @@ -430,6 +437,7 @@ static DEVICE_ATTR_RO(uboot_ver); static DEVICE_ATTR_RO(fw_os_ver); static DEVICE_ATTR_RO(security_enabled); static DEVICE_ATTR_RO(module_id); +static DEVICE_ATTR_RO(parent_device); static struct bin_attribute bin_attr_eeprom = { .attr = {.name = "eeprom", .mode = (0444)}, @@ -456,6 +464,7 @@ static struct attribute *hl_dev_attrs[] = { &dev_attr_fw_os_ver.attr, &dev_attr_security_enabled.attr, &dev_attr_module_id.attr, + &dev_attr_parent_device.attr, NULL, }; -- cgit From bc5f15abcf95ce7e4c2e33daddcb5850ee5e671d Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 29 Nov 2023 16:20:31 +0200 Subject: accel/habanalabs/gaudi2: avoid overriding existing undefined opcode data Part of the undefined opcode data is updated in gaudi2_handle_qman_err_generic() and some in handle_lower_qman_data_on_err(). However, the 'write_enable' flag is checked only in gaudi2_handle_qman_err_generic(), and information of more than a single error can be mixed there. Moreover, handle_lower_qman_data_on_err() is called only for the lower QMAN, so for an error in the upper QMAN there is only a partial info. 
Move all the data update to be done in a single place, protected by the 'write_enable' flag. As mainly the lower QMAN's info is interesting, avoid saving the partial info for the upper QMAN. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/gaudi2/gaudi2.c | 40 +++++++++++++++----------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index f81b57649b00..e0e5615ef9b0 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -7858,10 +7858,11 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, return !!ecc_data->is_critical; } -static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask) +static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u32 engine_id) { - u32 lo, hi, cq_ptr_size, cp_sts; + struct undefined_opcode_info *undef_opcode = &hdev->captured_err_info.undef_opcode; u64 cq_ptr, cp_current_inst; + u32 lo, hi, cq_size, cp_sts; bool is_arc_cq; cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET); @@ -7871,12 +7872,12 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_STS_OFFSET); hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_STS_OFFSET); cq_ptr = ((u64) hi) << 32 | lo; - cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET); + cq_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET); } else { lo = RREG32(qman_base + QM_CQ_PTR_LO_STS_4_OFFSET); hi = RREG32(qman_base + QM_CQ_PTR_HI_STS_4_OFFSET); cq_ptr = ((u64) hi) << 32 | lo; - cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET); + cq_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET); } lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET); @@ -7885,12 +7886,16 @@ static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 
qman_base, dev_info(hdev->dev, "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#018llx}\n", - is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst); + is_arc_cq ? "ARC_" : "", cq_ptr, cq_size, cp_current_inst); - if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { - hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr; - hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size; - hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS; + if (undef_opcode->write_enable) { + memset(undef_opcode, 0, sizeof(*undef_opcode)); + undef_opcode->timestamp = ktime_get(); + undef_opcode->cq_addr = cq_ptr; + undef_opcode->cq_size = cq_size; + undef_opcode->engine_id = engine_id; + undef_opcode->stream_id = QMAN_STREAMS; + undef_opcode->write_enable = 0; } } @@ -7929,19 +7934,12 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type error_count++; } - /* check for undefined opcode */ - if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK) { + /* Check for undefined opcode error in lower QM */ + if ((i == QMAN_STREAMS) && + (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK)) { + handle_lower_qman_data_on_err(hdev, qman_base, + gaudi2_queue_id_to_engine_id[qid_base]); *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; - if (hdev->captured_err_info.undef_opcode.write_enable) { - memset(&hdev->captured_err_info.undef_opcode, 0, - sizeof(hdev->captured_err_info.undef_opcode)); - hdev->captured_err_info.undef_opcode.timestamp = ktime_get(); - hdev->captured_err_info.undef_opcode.engine_id = - gaudi2_queue_id_to_engine_id[qid_base]; - } - - if (i == QMAN_STREAMS) - handle_lower_qman_data_on_err(hdev, qman_base, *event_mask); } } -- cgit From a9f07790a4b2250f0140e9a61c7f842fd9b618c7 Mon Sep 17 00:00:00 2001 From: Xingyuan Mo Date: Fri, 8 Dec 2023 21:00:59 +0800 Subject: accel/habanalabs: fix information leak in sec_attest_info() This function may copy the pad0 field of struct hl_info_sec_attest to user mode 
which has not been initialized, resulting in leakage of kernel heap data to user mode. To prevent this, use kzalloc() to allocate and zero out the buffer, which can also eliminate other uninitialized holes, if any. Fixes: 0c88760f8f5e ("habanalabs/gaudi2: add secured attestation info uapi") Signed-off-by: Xingyuan Mo Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index a92713e0e580..1dd6e23172ca 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -688,7 +688,7 @@ static int sec_attest_info(struct hl_fpriv *hpriv, struct hl_info_args *args) if (!sec_attest_info) return -ENOMEM; - info = kmalloc(sizeof(*info), GFP_KERNEL); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) { rc = -ENOMEM; goto free_sec_attest_info; -- cgit