summaryrefslogtreecommitdiff
path: root/drivers/misc/habanalabs/include
diff options
context:
space:
mode:
authorOded Gabbay <ogabbay@kernel.org>2022-06-26 18:20:03 +0300
committerOded Gabbay <ogabbay@kernel.org>2022-07-12 09:09:27 +0300
commitd7bb1ac89b2f5e230ec87659dabd2600897e49ef (patch)
treee3874b2d93e24b856caaffe9b7545fcf4b85f2ef /drivers/misc/habanalabs/include
parent97c6d22fa4bd54cccff39e862893327eef1bbfa8 (diff)
habanalabs: add gaudi2 asic-specific code
Add the ASIC-specific code for Gaudi2. Supply (almost) all of the function callbacks that the driver's common code need to initialize, finalize and submit workloads to the Gaudi2 ASIC. It also contains the code to initialize the F/W of the Gaudi2 ASIC and to receive events from the F/W. It contains new debugfs entry to dump razwi events. razwi is a case where the device's engines create a transaction that reaches an invalid destination. Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/include')
-rw-r--r--drivers/misc/habanalabs/include/common/cpucp_if.h294
-rw-r--r--drivers/misc/habanalabs/include/common/hl_boot_if.h7
2 files changed, 284 insertions, 17 deletions
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index b190a44ef2e2..719b2ff80985 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0
*
- * Copyright 2020-2021 HabanaLabs, Ltd.
+ * Copyright 2020-2022 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
@@ -99,27 +99,265 @@ struct hl_eq_fw_alive {
__u8 pad[7];
};
-enum hl_pcie_addr_dec_cause {
- PCIE_ADDR_DEC_HBW_ERR_RESP,
- PCIE_ADDR_DEC_LBW_ERR_RESP,
- PCIE_ADDR_DEC_TLP_BLOCKED_BY_RR
+struct hl_eq_intr_cause {
+ __le64 intr_cause_data;
};
-struct hl_eq_pcie_addr_dec_data {
- /* enum hl_pcie_addr_dec_cause */
- __u8 addr_dec_cause;
- __u8 pad[7];
+struct hl_eq_pcie_drain_ind_data {
+ struct hl_eq_intr_cause intr_cause;
+ __le64 drain_wr_addr_lbw;
+ __le64 drain_rd_addr_lbw;
+ __le64 drain_wr_addr_hbw;
+ __le64 drain_rd_addr_hbw;
+};
+
+struct hl_eq_razwi_lbw_info_regs {
+ __le32 rr_aw_razwi_reg;
+ __le32 rr_aw_razwi_id_reg;
+ __le32 rr_ar_razwi_reg;
+ __le32 rr_ar_razwi_id_reg;
+};
+
+struct hl_eq_razwi_hbw_info_regs {
+ __le32 rr_aw_razwi_hi_reg;
+ __le32 rr_aw_razwi_lo_reg;
+ __le32 rr_aw_razwi_id_reg;
+ __le32 rr_ar_razwi_hi_reg;
+ __le32 rr_ar_razwi_lo_reg;
+ __le32 rr_ar_razwi_id_reg;
+};
+
+/* razwi_happened masks */
+#define RAZWI_HAPPENED_HBW 0x1
+#define RAZWI_HAPPENED_LBW 0x2
+#define RAZWI_HAPPENED_AW 0x4
+#define RAZWI_HAPPENED_AR 0x8
+
+struct hl_eq_razwi_info {
+ __le32 razwi_happened_mask;
+ union {
+ struct hl_eq_razwi_lbw_info_regs lbw;
+ struct hl_eq_razwi_hbw_info_regs hbw;
+ };
+ __le32 pad;
+};
+
+struct hl_eq_razwi_with_intr_cause {
+ struct hl_eq_razwi_info razwi_info;
+ struct hl_eq_intr_cause intr_cause;
+};
+
+#define HBM_CA_ERR_CMD_LIFO_LEN 8
+#define HBM_RD_ERR_DATA_LIFO_LEN 8
+#define HBM_WR_PAR_CMD_LIFO_LEN 11
+
+enum hl_hbm_sei_cause {
+ /* Command/address parity error event is split into 2 events due to
+ * size limitation: ODD suffix for odd HBM CK_t cycles and EVEN suffix
+ * for even HBM CK_t cycles
+ */
+ HBM_SEI_CMD_PARITY_EVEN,
+ HBM_SEI_CMD_PARITY_ODD,
+ /* Read errors can be reflected as a combination of SERR/DERR/parity
+ * errors. Therefore, we define one event for all read error types.
+ * LKD will perform further proccessing.
+ */
+ HBM_SEI_READ_ERR,
+ HBM_SEI_WRITE_DATA_PARITY_ERR,
+ HBM_SEI_CATTRIP,
+ HBM_SEI_MEM_BIST_FAIL,
+ HBM_SEI_DFI,
+ HBM_SEI_INV_TEMP_READ_OUT,
+ HBM_SEI_BIST_FAIL,
+};
+
+/* Masks for parsing hl_hbm_sei_headr fields */
+#define HBM_ECC_SERR_CNTR_MASK 0xFF
+#define HBM_ECC_DERR_CNTR_MASK 0xFF00
+#define HBM_RD_PARITY_CNTR_MASK 0xFF0000
+
+/* HBM index and MC index are known by the event_id */
+struct hl_hbm_sei_header {
+ union {
+ /* relevant only in case of HBM read error */
+ struct {
+ __u8 ecc_serr_cnt;
+ __u8 ecc_derr_cnt;
+ __u8 read_par_cnt;
+ __u8 reserved;
+ };
+ /* All other cases */
+ __le32 cnt;
+ };
+ __u8 sei_cause; /* enum hl_hbm_sei_cause */
+ __u8 mc_channel; /* range: 0-3 */
+ __u8 mc_pseudo_channel; /* range: 0-7 */
+ __u8 pad[1];
+};
+
+#define HBM_RD_ADDR_SID_SHIFT 0
+#define HBM_RD_ADDR_SID_MASK 0x1
+#define HBM_RD_ADDR_BG_SHIFT 1
+#define HBM_RD_ADDR_BG_MASK 0x6
+#define HBM_RD_ADDR_BA_SHIFT 3
+#define HBM_RD_ADDR_BA_MASK 0x18
+#define HBM_RD_ADDR_COL_SHIFT 5
+#define HBM_RD_ADDR_COL_MASK 0x7E0
+#define HBM_RD_ADDR_ROW_SHIFT 11
+#define HBM_RD_ADDR_ROW_MASK 0x3FFF800
+
+struct hbm_rd_addr {
+ union {
+ /* bit fields are only for FW use */
+ struct {
+ u32 dbg_rd_err_addr_sid:1;
+ u32 dbg_rd_err_addr_bg:2;
+ u32 dbg_rd_err_addr_ba:2;
+ u32 dbg_rd_err_addr_col:6;
+ u32 dbg_rd_err_addr_row:15;
+ u32 reserved:6;
+ };
+ __le32 rd_addr_val;
+ };
+};
+
+#define HBM_RD_ERR_BEAT_SHIFT 2
+/* dbg_rd_err_misc fields: */
+/* Read parity is calculated per DW on every beat */
+#define HBM_RD_ERR_PAR_ERR_BEAT0_SHIFT 0
+#define HBM_RD_ERR_PAR_ERR_BEAT0_MASK 0x3
+#define HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT 8
+#define HBM_RD_ERR_PAR_DATA_BEAT0_MASK 0x300
+/* ECC is calculated per PC on every beat */
+#define HBM_RD_ERR_SERR_BEAT0_SHIFT 16
+#define HBM_RD_ERR_SERR_BEAT0_MASK 0x10000
+#define HBM_RD_ERR_DERR_BEAT0_SHIFT 24
+#define HBM_RD_ERR_DERR_BEAT0_MASK 0x100000
+
+struct hl_eq_hbm_sei_read_err_intr_info {
+ /* DFI_RD_ERR_REP_ADDR */
+ struct hbm_rd_addr dbg_rd_err_addr;
+ /* DFI_RD_ERR_REP_ERR */
+ union {
+ struct {
+ /* bit fields are only for FW use */
+ u32 dbg_rd_err_par:8;
+ u32 dbg_rd_err_par_data:8;
+ u32 dbg_rd_err_serr:4;
+ u32 dbg_rd_err_derr:4;
+ u32 reserved:8;
+ };
+ __le32 dbg_rd_err_misc;
+ };
+ /* DFI_RD_ERR_REP_DM */
+ __le32 dbg_rd_err_dm;
+ /* DFI_RD_ERR_REP_SYNDROME */
+ __le32 dbg_rd_err_syndrome;
+ /* DFI_RD_ERR_REP_DATA */
+ __le32 dbg_rd_err_data[HBM_RD_ERR_DATA_LIFO_LEN];
+};
+
+struct hl_eq_hbm_sei_ca_par_intr_info {
+ /* 14 LSBs */
+ __le16 dbg_row[HBM_CA_ERR_CMD_LIFO_LEN];
+ /* 18 LSBs */
+ __le32 dbg_col[HBM_CA_ERR_CMD_LIFO_LEN];
+};
+
+#define WR_PAR_LAST_CMD_COL_SHIFT 0
+#define WR_PAR_LAST_CMD_COL_MASK 0x3F
+#define WR_PAR_LAST_CMD_BG_SHIFT 6
+#define WR_PAR_LAST_CMD_BG_MASK 0xC0
+#define WR_PAR_LAST_CMD_BA_SHIFT 8
+#define WR_PAR_LAST_CMD_BA_MASK 0x300
+#define WR_PAR_LAST_CMD_SID_SHIFT 10
+#define WR_PAR_LAST_CMD_SID_MASK 0x400
+
+/* Row address isn't latched */
+struct hbm_sei_wr_cmd_address {
+ /* DFI_DERR_LAST_CMD */
+ union {
+ struct {
+ /* bit fields are only for FW use */
+ u32 col:6;
+ u32 bg:2;
+ u32 ba:2;
+ u32 sid:1;
+ u32 reserved:21;
+ };
+ __le32 dbg_wr_cmd_addr;
+ };
+};
+
+struct hl_eq_hbm_sei_wr_par_intr_info {
+ /* entry 0: WR command address from the 1st cycle prior to the error
+ * entry 1: WR command address from the 2nd cycle prior to the error
+ * and so on...
+ */
+ struct hbm_sei_wr_cmd_address dbg_last_wr_cmds[HBM_WR_PAR_CMD_LIFO_LEN];
+ /* derr[0:1] - 1st HBM cycle DERR output
+ * derr[2:3] - 2nd HBM cycle DERR output
+ */
+ __u8 dbg_derr;
+ /* extend to reach 8B */
+ __u8 pad[3];
+};
+
+/*
+ * this struct represents the following sei causes:
+ * command parity, ECC double error, ECC single error, dfi error, cattrip,
+ * temperature read-out, read parity error and write parity error.
+ * some only use the header while some have extra data.
+ */
+struct hl_eq_hbm_sei_data {
+ struct hl_hbm_sei_header hdr;
+ union {
+ struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_even_info;
+ struct hl_eq_hbm_sei_ca_par_intr_info ca_parity_odd_info;
+ struct hl_eq_hbm_sei_read_err_intr_info read_err_info;
+ struct hl_eq_hbm_sei_wr_par_intr_info wr_parity_info;
+ };
+};
+
+/* Engine/farm arc interrupt type */
+enum hl_engine_arc_interrupt_type {
+ /* Qman/farm ARC DCCM QUEUE FULL interrupt type */
+ ENGINE_ARC_DCCM_QUEUE_FULL_IRQ = 1
+};
+
+/* Data structure specifies details of payload of DCCM QUEUE FULL interrupt */
+struct hl_engine_arc_dccm_queue_full_irq {
+ /* Queue index value which caused DCCM QUEUE FULL */
+ __le32 queue_index;
+ __le32 pad;
+};
+
+/* Data structure specifies details of QM/FARM ARC interrupt */
+struct hl_eq_engine_arc_intr_data {
+ /* ARC engine id e.g. DCORE0_TPC0_QM_ARC, DCORE0_TCP1_QM_ARC */
+ __le32 engine_id;
+ __le32 intr_type; /* enum hl_engine_arc_interrupt_type */
+ /* More info related to the interrupt e.g. queue index
+ * incase of DCCM_QUEUE_FULL interrupt.
+ */
+ __le64 payload;
+ __le64 pad[5];
};
struct hl_eq_entry {
struct hl_eq_header hdr;
union {
struct hl_eq_ecc_data ecc_data;
- struct hl_eq_hbm_ecc_data hbm_ecc_data;
+ struct hl_eq_hbm_ecc_data hbm_ecc_data; /* Gaudi1 HBM */
struct hl_eq_sm_sei_data sm_sei_data;
struct cpucp_pkt_sync_err pkt_sync_err;
struct hl_eq_fw_alive fw_alive;
- struct hl_eq_pcie_addr_dec_data pcie_addr_dec_data;
+ struct hl_eq_intr_cause intr_cause;
+ struct hl_eq_pcie_drain_ind_data pcie_drain_ind_data;
+ struct hl_eq_razwi_info razwi_info;
+ struct hl_eq_razwi_with_intr_cause razwi_with_intr_cause;
+ struct hl_eq_hbm_sei_data sei_data; /* Gaudi2 HBM */
+ struct hl_eq_engine_arc_intr_data arc_data;
__le64 data[7];
};
};
@@ -793,10 +1031,23 @@ struct cpucp_security_info {
* @infineon_second_stage_version: Infineon 2nd stage DC-DC version.
* @dram_size: available DRAM size.
* @card_name: card name that will be displayed in HWMON subsystem on the host
+ * @tpc_binning_mask: TPC binning mask, 1 bit per TPC instance
+ * (0 = functional, 1 = binned)
+ * @decoder_binning_mask: Decoder binning mask, 1 bit per decoder instance
+ * (0 = functional, 1 = binned), maximum 1 per dcore
+ * @sram_binning: Categorize SRAM functionality
+ * (0 = fully functional, 1 = lower-half is not functional,
+ * 2 = upper-half is not functional)
* @sec_info: security information
* @pll_map: Bit map of supported PLLs for current ASIC version.
* @mme_binning_mask: MME binning mask,
- * (0 = functional, 1 = binned)
+ * bits [0:6] <==> dcore0 mme fma
+ * bits [7:13] <==> dcore1 mme fma
+ * bits [14:20] <==> dcore0 mme ima
+ * bits [21:27] <==> dcore1 mme ima
+ * For each group, if the 6th bit is set then first 5 bits
+ * represent the col's idx [0-31], otherwise these bits are
+ * ignored, and col idx 32 is binned. 7th bit is don't care.
* @dram_binning_mask: DRAM binning mask, 1 bit per dram instance
* (0 = functional 1 = binned)
* @memory_repair_flag: eFuse flag indicating memory repair
@@ -804,6 +1055,8 @@ struct cpucp_security_info {
* (0 = functional 1 = binned)
* @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance
* (0 = functional 1 = binned)
+ * @interposer_version: Interposer version programmed in eFuse
+ * @substrate_version: Substrate version programmed in eFuse
* @fw_os_version: Firmware OS Version
*/
struct cpucp_info {
@@ -820,16 +1073,18 @@ struct cpucp_info {
__le32 infineon_second_stage_version;
__le64 dram_size;
char card_name[CARD_NAME_MAX_LEN];
- __le64 reserved3;
- __le64 reserved4;
- __u8 reserved5;
+ __le64 tpc_binning_mask;
+ __le64 decoder_binning_mask;
+ __u8 sram_binning;
__u8 dram_binning_mask;
__u8 memory_repair_flag;
__u8 edma_binning_mask;
__u8 xbar_binning_mask;
- __u8 pad[3];
+ __u8 interposer_version;
+ __u8 substrate_version;
+ __u8 reserved2;
struct cpucp_security_info sec_info;
- __le32 reserved6;
+ __le32 reserved3;
__u8 pll_map[PLL_MAP_LEN];
__le64 mme_binning_mask;
__u8 fw_os_version[VERSION_MAX_LEN];
@@ -933,6 +1188,11 @@ struct cpucp_hbm_row_replaced_rows_info {
struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
};
+enum cpu_reset_status {
+ CPU_RST_STATUS_NA = 0,
+ CPU_RST_STATUS_SOFT_RST_DONE = 1,
+};
+
/*
* struct dcore_monitor_regs_data - DCORE monitor regs data.
* the structure follows sync manager block layout. relevant only to Gaudi.
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 15f91ae9de6e..a3594119bc51 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -525,6 +525,13 @@ struct lkd_fw_comms_msg {
struct {
__u8 fw_cfg_skip; /* 1 - skip, 0 - don't skip */
};
+ struct {
+ __le64 tpc_binning_conf;
+ __le32 dec_binning_conf;
+ __le32 hbm_binning_conf;
+ __le32 edma_binning_conf;
+ __le32 mme_redundancy_conf; /* use MME_REDUNDANT_COLUMN */
+ };
};
};