diff options
author | Moti Haimovski <mhaimovski@habana.ai> | 2023-01-10 17:35:31 +0200 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2023-03-15 13:29:12 +0200 |
commit | 313e9f63b74419ca14c2c09f581a79c7037ee0e2 (patch) | |
tree | 4dbca4fcd9fd4357a7f1e98cf51a9b14af7676eb /drivers | |
parent | 09524eb8824e102fb57210967bc9d61d7469121c (diff) |
accel/habanalabs: add critical-event bit in notifier
Enhance the existing user notifications by adding a HW and FW critical
event bits to be used when a HW or FW event occur that requires
both SW abort and hard-resetting the chip.
Signed-off-by: Moti Haimovski <mhaimovski@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Reviewed-by: Stanislaw Gruszka <stanislaw.gruszka@linux.intel.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/accel/habanalabs/common/device.c | 53 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs.h | 54 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs_drv.c | 5 | ||||
-rw-r--r-- | drivers/accel/habanalabs/common/habanalabs_ioctl.c | 50 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi/gaudi.c | 10 | ||||
-rw-r--r-- | drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 |
6 files changed, 170 insertions, 6 deletions
diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index b8c74185eabd..f91f3509336f 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -998,6 +998,8 @@ static void hl_device_heartbeat(struct work_struct *work) { struct hl_device *hdev = container_of(work, struct hl_device, work_heartbeat.work); + struct hl_info_fw_err_info info = {0}; + u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; if (!hl_device_operational(hdev, NULL)) goto reschedule; @@ -1008,7 +1010,10 @@ static void hl_device_heartbeat(struct work_struct *work) if (hl_device_operational(hdev, NULL)) dev_err(hdev->dev, "Device heartbeat failed!\n"); - hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); + info.err_type = HL_INFO_FW_HEARTBEAT_ERR; + info.event_mask = &event_mask; + hl_handle_fw_err(hdev, &info); + hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask); return; @@ -2626,3 +2631,49 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_ if (event_mask) *event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT; } + +void hl_capture_hw_err(struct hl_device *hdev, u16 event_id) +{ + struct hw_err_info *info = &hdev->captured_err_info.hw_err; + + /* Capture only the first HW err */ + if (atomic_cmpxchg(&info->event_detected, 0, 1)) + return; + + info->event.timestamp = ktime_to_ns(ktime_get()); + info->event.event_id = event_id; + + info->event_info_available = true; +} + +void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask) +{ + hl_capture_hw_err(hdev, event_id); + + if (event_mask) + *event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR; +} + +void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info) +{ + struct fw_err_info *info = &hdev->captured_err_info.fw_err; + + /* Capture only the first FW error */ + if (atomic_cmpxchg(&info->event_detected, 0, 1)) + return; + + info->event.timestamp = ktime_to_ns(ktime_get()); + info->event.err_type = fw_info->err_type; + if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR) + info->event.event_id = fw_info->event_id; + + info->event_info_available = true; +} + +void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info) +{ + hl_capture_fw_err(hdev, info); + + if (info->event_mask) + *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; +} diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index afdae5775eaa..176a2e2c050d 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3032,17 +3032,55 @@ struct razwi_info { }; /** + * struct hw_err_info - HW error information. + * @event: holds information on the event. + * @event_detected: if set as 1, then a HW event was discovered for the + * first time after the driver has finished booting-up. + * currently we assume that only fatal events (that require hard-reset) are + * reported so we don't care of the others that might follow it. + * so once changed to 1, it will remain that way. + * TODO: support multiple events. + * @event_info_available: indicates that a HW event info is now available. + */ +struct hw_err_info { + struct hl_info_hw_err_event event; + atomic_t event_detected; + bool event_info_available; +}; + +/** + * struct fw_err_info - FW error information. + * @event: holds information on the event. + * @event_detected: if set as 1, then a FW event was discovered for the + * first time after the driver has finished booting-up. + * currently we assume that only fatal events (that require hard-reset) are + * reported so we don't care of the others that might follow it. + * so once changed to 1, it will remain that way. + * TODO: support multiple events. + * @event_info_available: indicates that a HW event info is now available. + */ +struct fw_err_info { + struct hl_info_fw_err_event event; + atomic_t event_detected; + bool event_info_available; +}; + +/** * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi_info: RAZWI information. * @undef_opcode: undefined opcode information. * @page_fault_info: page fault information. + * @hw_err: (fatal) hardware error information. + * @fw_err: firmware error information. */ struct hl_error_info { struct cs_timeout_info cs_timeout; struct razwi_info razwi_info; struct undefined_opcode_info undef_opcode; struct page_fault_info page_fault_info; + struct hw_err_info hw_err; + struct fw_err_info fw_err; }; /** @@ -3453,6 +3491,20 @@ struct hl_cs_encaps_sig_handle { u32 count; }; +/** + * struct hl_info_fw_err_info - firmware error information structure + * @err_type: The type of error detected (or reported). + * @event_mask: Pointer to the event mask to be modified with the detected error flag + * (can be NULL) + * @event_id: The id of the event that reported the error + * (applicable when err_type is HL_INFO_FW_REPORTED_ERR). + */ +struct hl_info_fw_err_info { + enum hl_info_fw_err_type err_type; + u64 *event_mask; + u16 event_id; +}; + /* * IOCTLs */ @@ -3883,6 +3935,8 @@ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_o void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, u64 *event_mask); +void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask); +void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 8ccc3d6519b5..0cb6e52a1192 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -221,12 +221,9 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); + memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info)); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0); - atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0); hdev->captured_err_info.undef_opcode.write_enable = true; - hdev->captured_err_info.razwi_info.razwi_info_available = false; - hdev->captured_err_info.page_fault_info.page_fault_info_available = false; hdev->open_counter++; hdev->last_successful_open_jif = jiffies; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 5005e6fca691..13cd5013c72a 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -830,6 +830,50 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0; } +static int hw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer; + struct hl_device *hdev = hpriv->hdev; + u32 user_buf_size = args->return_size; + struct hw_err_info *info; + int rc; + + if ((!user_buf_size) || (!user_buf)) + return -EINVAL; + + if (user_buf_size < sizeof(struct hl_info_hw_err_event)) + return -ENOMEM; + + info = &hdev->captured_err_info.hw_err; + if (!info->event_info_available) + return -ENOENT; + + rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_hw_err_event)); + return rc ? -EFAULT : 0; +} + +static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer; + struct hl_device *hdev = hpriv->hdev; + u32 user_buf_size = args->return_size; + struct fw_err_info *info; + int rc; + + if ((!user_buf_size) || (!user_buf)) + return -EINVAL; + + if (user_buf_size < sizeof(struct hl_info_fw_err_event)) + return -ENOMEM; + + info = &hdev->captured_err_info.fw_err; + if (!info->event_info_available) + return -ENOENT; + + rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_fw_err_event)); + return rc ? -EFAULT : 0; +} + static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args) { void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer; @@ -950,6 +994,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_UNREGISTER_EVENTFD: return eventfd_unregister(hpriv, args); + case HL_INFO_HW_ERR_EVENT: + return hw_err_info(hpriv, args); + + case HL_INFO_FW_ERR_EVENT: + return fw_err_info(hpriv, args); + default: break; } diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 4ba5352cb7cb..0e02aebb6ea6 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -7634,6 +7634,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type, static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { struct gaudi_device *gaudi = hdev->asic_specific; + struct hl_info_fw_err_info fw_err_info; u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0; u32 ctl = le32_to_cpu(eq_entry->hdr.ctl); u32 fw_fatal_err_flag = 0, flags = 0; @@ -7912,7 +7913,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_FW_ALIVE_S: gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive); - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + fw_err_info.err_type = HL_INFO_FW_REPORTED_ERR; + fw_err_info.event_id = event_type; + fw_err_info.event_mask = &event_mask; + hl_handle_fw_err(hdev, &fw_err_info); goto reset_device; default: @@ -7943,6 +7947,10 @@ reset_device: } if (reset_required) { + /* escalate general hw errors to critical/fatal error */ + if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR) + hl_handle_critical_hw_err(hdev, event_type, &event_mask); + hl_device_cond_reset(hdev, flags, event_mask); } else { hl_fw_unmask_irq(hdev, event_type); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index d250c6f01acb..6926af5b5ed1 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -9444,6 +9444,10 @@ reset_device: } else { reset_flags |= HL_DRV_RESET_DELAY; } + /* escalate general hw errors to critical/fatal error */ + if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR) + hl_handle_critical_hw_err(hdev, event_type, &event_mask); + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; hl_device_cond_reset(hdev, reset_flags, event_mask); } |