diff options
author | Dani Liberman <dliberman@habana.ai> | 2022-09-19 18:51:59 +0300 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2022-11-23 16:13:39 +0200 |
commit | 52d5e5469526216bcc418f26a2796d5af6226023 (patch) | |
tree | 030d47223969c43ebb0b1e45664e410f381ac3f3 /drivers/misc/habanalabs | |
parent | ea73ef14ddf93b8b1ae6ce1963846f43a81bb510 (diff) |
habanalabs: refactor razwi event notification
This event notification was compatible only with gaudi, where razwi
and page fault happens together.
To make it compatible with all ASICs, this refactor contains:
1. Razwi notification will only notify about razwi info.
New notification will be added in future patch, to retrieve data
about page fault error.
2. Changed razwi info structure to support all ASICs.
Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs')
-rw-r--r-- | drivers/misc/habanalabs/common/device.c | 22 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 31 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_drv.c | 2 | ||||
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs_ioctl.c | 12 | ||||
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 64 |
5 files changed, 61 insertions, 70 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 1aaaa2004e34..30ddaaae67e5 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2253,3 +2253,25 @@ inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) { writel(val, hdev->rmmio + reg); } + +void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags) +{ + if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) { + dev_err(hdev->dev, + "Number of possible razwi initiators (%u) exceeded limit (%u)\n", + num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR); + return; + } + + /* In case it's the first razwi since the device was opened, capture its parameters */ + if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info_recorded, 0, 1)) + return; + + hdev->captured_err_info.razwi.timestamp = ktime_to_ns(ktime_get()); + hdev->captured_err_info.razwi.addr = addr; + hdev->captured_err_info.razwi.num_of_possible_engines = num_of_engines; + memcpy(&hdev->captured_err_info.razwi.engine_id[0], &engine_id[0], + num_of_engines * sizeof(u16)); + hdev->captured_err_info.razwi.flags = flags; +} diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 2ffb8378f565..cdc50c2c4de8 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2925,30 +2925,6 @@ struct cs_timeout_info { u64 seq; }; -/** - * struct razwi_info - info about last razwi error occurred. - * @timestamp: razwi timestamp. - * @write_enable: if set writing to razwi parameters in the structure is enabled. - * otherwise - disabled, so the first (root cause) razwi will not be overwritten. - * @addr: address that caused razwi. - * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does - * not have engine id it will be set to U16_MAX. - * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible - * engines which one them caused the razwi. In that case, it will contain the - * second possible engine id, otherwise it will be set to U16_MAX. - * @non_engine_initiator: in case the initiator of the razwi does not have engine id. - * @type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. - */ -struct razwi_info { - ktime_t timestamp; - atomic_t write_enable; - u64 addr; - u16 engine_id_1; - u16 engine_id_2; - u8 non_engine_initiator; - u8 type; -}; - #define MAX_QMAN_STREAMS_INFO 4 #define OPCODE_INFO_MAX_ADDR_SIZE 8 /** @@ -2985,11 +2961,14 @@ struct undefined_opcode_info { * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi: razwi information. + * @razwi_info_recorded: if set writing to razwi information is enabled. + * otherwise - disabled, so the first (root cause) razwi will not be overwritten. * @undef_opcode: undefined opcode information */ struct hl_error_info { struct cs_timeout_info cs_timeout; - struct razwi_info razwi; + struct hl_info_razwi_event razwi; + atomic_t razwi_info_recorded; struct undefined_opcode_info undef_opcode; }; @@ -3800,6 +3779,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp, void *args); __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); +void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 3ee44ea58d5c..d87434b9bc16 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -212,7 +212,7 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - atomic_set(&hdev->captured_err_info.razwi.write_enable, 1); + atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0); hdev->captured_err_info.undef_opcode.write_enable = true; hdev->open_counter++; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 43afe40966e5..6aef4e24d122 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -603,20 +603,14 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; u32 max_size = args->return_size; - struct hl_info_razwi_event info = {0}; + struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi; void __user *out = (void __user *) (uintptr_t) args->return_pointer; if ((!max_size) || (!out)) return -EINVAL; - info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp); - info.addr = hdev->captured_err_info.razwi.addr; - info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1; - info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2; - info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator; - info.error_type = hdev->captured_err_info.razwi.type; - - return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; + return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event))) + ? -EFAULT : 0; } static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 92560414e843..f856ac51fde1 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6505,8 +6505,8 @@ event_not_supported: } static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y, - bool is_write, s32 *engine_id_1, - s32 *engine_id_2) + bool is_write, u16 *engine_id_1, + u16 *engine_id_2) { u32 dma_id[2], dma_offset, err_cause[2], mask, i; @@ -6603,7 +6603,7 @@ unknown_initiator: } static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write, - u32 *engine_id_1, u32 *engine_id_2) + u16 *engine_id_1, u16 *engine_id_2) { u32 val, x_y, axi_id; @@ -6719,8 +6719,8 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool i return "unknown initiator"; } -static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1, - u32 *engine_id_2) +static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_id_1, + u16 *engine_id_2, bool *is_read, bool *is_write) { if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) { @@ -6728,6 +6728,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i "RAZWI event caused by illegal write of %s\n", gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0); + *is_write = true; } if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) { @@ -6735,10 +6736,11 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i "RAZWI event caused by illegal read of %s\n", gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_READ_VLD, 0); + *is_read = true; } } -static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type) +static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr) { struct gaudi_device *gaudi = hdev->asic_specific; u32 val; @@ -6753,8 +6755,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); - *type = HL_RAZWI_PAGE_FAULT; - WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } @@ -6765,7 +6765,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr); - *type = HL_RAZWI_MMU_ACCESS_ERROR; WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0); } @@ -7302,46 +7301,41 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, bool razwi) { - u32 engine_id_1, engine_id_2; + bool is_read = false, is_write = false; + u16 engine_id[2], num_of_razwi_eng = 0; char desc[64] = ""; u64 razwi_addr = 0; - u8 razwi_type; - int rc; + u8 razwi_flags = 0; /* * Init engine id by default as not valid and only if razwi initiated from engine with * engine id it will get valid value. - * Init razwi type to default, will be changed only if razwi caused by page fault of - * MMU access error */ - engine_id_1 = U16_MAX; - engine_id_2 = U16_MAX; - razwi_type = U8_MAX; + engine_id[0] = HL_RAZWI_NA_ENG_ID; + engine_id[1] = HL_RAZWI_NA_ENG_ID; gaudi_get_event_desc(event_type, desc, sizeof(desc)); dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); if (razwi) { - gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2); - gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type); - - /* In case it's the first razwi, save its parameters*/ - rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0); - if (rc) { - hdev->captured_err_info.razwi.timestamp = ktime_get(); - hdev->captured_err_info.razwi.addr = razwi_addr; - hdev->captured_err_info.razwi.engine_id_1 = engine_id_1; - hdev->captured_err_info.razwi.engine_id_2 = engine_id_2; - /* - * If first engine id holds non valid value the razwi initiator - * does not have engine id - */ - hdev->captured_err_info.razwi.non_engine_initiator = - (engine_id_1 == U16_MAX); - hdev->captured_err_info.razwi.type = razwi_type; - + gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read, + &is_write); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr); + + if (is_read) + razwi_flags |= HL_RAZWI_READ; + if (is_write) + razwi_flags |= HL_RAZWI_WRITE; + + if (engine_id[0] != HL_RAZWI_NA_ENG_ID) { + if (engine_id[1] != HL_RAZWI_NA_ENG_ID) + num_of_razwi_eng = 2; + else + num_of_razwi_eng = 1; } + + hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags); } } |