summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h194
1 files changed, 176 insertions, 18 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 76fb85628716..82db986c36a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -26,12 +26,33 @@
#include <linux/debugfs.h>
#include <linux/list.h>
+#include <linux/kfifo.h>
+#include <linux/radix-tree.h>
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
#include "amdgpu_smuio.h"
+#include "amdgpu_aca.h"
struct amdgpu_iv_entry;
+#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 0, 0)
+#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x) AMDGPU_GET_REG_FIELD(x, 1, 1)
+#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 2, 2)
+#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 3, 3)
+#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 4, 4)
+#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x) AMDGPU_GET_REG_FIELD(x, 5, 5)
+#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x) AMDGPU_GET_REG_FIELD(x, 6, 6)
+#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x) AMDGPU_GET_REG_FIELD(x, 7, 7)
+#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8)
+#define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11)
+#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13)
+#define AMDGPU_RAS_GPU_ERR_DATA_ABORT(x) AMDGPU_GET_REG_FIELD(x, 29, 29)
+#define AMDGPU_RAS_GPU_ERR_UNKNOWN(x) AMDGPU_GET_REG_FIELD(x, 30, 30)
+
+#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100
+#define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA
+#define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF
+
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
/* position of instance value in sub_block_index of
* ta_ras_trigger_error_input, the sub block uses lower 12 bits
@@ -39,6 +60,24 @@ struct amdgpu_iv_entry;
#define AMDGPU_RAS_INST_MASK 0xfffff000
#define AMDGPU_RAS_INST_SHIFT 0xc
+#define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29
+#define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000
+
+/* Reserve 8 physical dram row for possible retirement.
+ * In worst cases, it will lose 8 * 2MB memory in vram domain */
+#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20)
+/* The high three bits indicates socketid */
+#define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK)
+
+#define RAS_EVENT_INVALID_ID (BIT_ULL(63))
+#define RAS_EVENT_ID_IS_VALID(x) (!((x) & BIT_ULL(63)))
+
+#define RAS_EVENT_LOG(adev, id, fmt, ...) \
+ amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__)
+
+#define amdgpu_ras_mark_ras_event(adev, type) \
+ (amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0)))
+
enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
@@ -57,8 +96,11 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__VCN,
AMDGPU_RAS_BLOCK__JPEG,
+ AMDGPU_RAS_BLOCK__IH,
+ AMDGPU_RAS_BLOCK__MPIO,
- AMDGPU_RAS_BLOCK__LAST
+ AMDGPU_RAS_BLOCK__LAST,
+ AMDGPU_RAS_BLOCK__ANY = -1
};
enum amdgpu_ras_mca_block {
@@ -324,6 +366,7 @@ enum amdgpu_ras_error_query_mode {
AMDGPU_RAS_INVALID_ERROR_QUERY = 0,
AMDGPU_RAS_DIRECT_ERROR_QUERY = 1,
AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2,
+ AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY = 3,
};
/* ras error status reisger fields */
@@ -392,6 +435,66 @@ struct umc_ecc_info {
int record_ce_addr_supported;
};
+enum ras_event_type {
+ RAS_EVENT_TYPE_INVALID = 0,
+ RAS_EVENT_TYPE_FATAL,
+ RAS_EVENT_TYPE_POISON_CREATION,
+ RAS_EVENT_TYPE_POISON_CONSUMPTION,
+ RAS_EVENT_TYPE_COUNT,
+};
+
+struct ras_event_state {
+ u64 last_seqno;
+ atomic64_t count;
+};
+
+struct ras_event_manager {
+ atomic64_t seqno;
+ struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT];
+};
+
+struct ras_event_id {
+ enum ras_event_type type;
+ u64 event_id;
+};
+
+struct ras_query_context {
+ struct ras_event_id evid;
+};
+
+typedef int (*pasid_notify)(struct amdgpu_device *adev,
+ uint16_t pasid, void *data);
+
+struct ras_poison_msg {
+ enum amdgpu_ras_block block;
+ uint16_t pasid;
+ uint32_t reset;
+ pasid_notify pasid_fn;
+ void *data;
+};
+
+struct ras_err_pages {
+ uint32_t count;
+ uint64_t *pfn;
+};
+
+struct ras_ecc_err {
+ uint64_t status;
+ uint64_t ipid;
+ uint64_t addr;
+ uint64_t pa_pfn;
+ /* save global channel index across all UMC instances */
+ uint32_t channel_idx;
+ struct ras_err_pages err_pages;
+};
+
+struct ras_ecc_log_info {
+ struct mutex lock;
+ struct radix_tree_root de_page_tree;
+ uint64_t de_queried_count;
+ uint64_t prev_de_queried_count;
+};
+
struct amdgpu_ras {
/* ras infrastructure */
/* for ras itself. */
@@ -402,6 +505,7 @@ struct amdgpu_ras {
struct device_attribute features_attr;
struct device_attribute version_attr;
struct device_attribute schema_attr;
+ struct device_attribute event_state_attr;
struct bin_attribute badpages_attr;
struct dentry *de_ras_eeprom_table;
/* block array */
@@ -441,10 +545,30 @@ struct amdgpu_ras {
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
/* Record status of smu mca debug mode */
- bool is_mca_debug_mode;
+ bool is_aca_debug_mode;
+ bool is_rma;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
+
+ struct task_struct *page_retirement_thread;
+ wait_queue_head_t page_retirement_wq;
+ struct mutex page_retirement_lock;
+ atomic_t page_retirement_req_cnt;
+ atomic_t poison_creation_count;
+ struct mutex page_rsv_lock;
+ DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
+ struct ras_ecc_log_info umc_ecc_log;
+ struct delayed_work page_retirement_dwork;
+
+ /* ras errors detected */
+ unsigned long ras_err_state;
+
+ /* RAS event manager */
+ struct ras_event_manager __event_mgr;
+ struct ras_event_manager *event_mgr;
+
+ uint64_t reserved_pages_in_bytes;
};
struct ras_fs_data {
@@ -452,17 +576,11 @@ struct ras_fs_data {
char debugfs_name[32];
};
-struct ras_err_addr {
- uint64_t err_status;
- uint64_t err_ipid;
- uint64_t err_addr;
-};
-
struct ras_err_info {
struct amdgpu_smuio_mcm_config_info mcm_info;
u64 ce_count;
u64 ue_count;
- struct ras_err_addr err_addr;
+ u64 de_count;
};
struct ras_err_node {
@@ -473,8 +591,10 @@ struct ras_err_node {
struct ras_err_data {
unsigned long ue_count;
unsigned long ce_count;
+ unsigned long de_count;
unsigned long err_addr_cnt;
struct eeprom_table_record *err_addr;
+ unsigned long err_addr_len;
u32 err_list_count;
struct list_head err_node_list;
};
@@ -529,6 +649,8 @@ struct ras_manager {
struct ras_ih_data ih_data;
struct ras_err_data err_data;
+
+ struct aca_handle aca_handle;
};
struct ras_badpage {
@@ -548,6 +670,7 @@ struct ras_query_if {
struct ras_common_if head;
unsigned long ue_count;
unsigned long ce_count;
+ unsigned long de_count;
};
struct ras_inject_if {
@@ -617,8 +740,8 @@ struct amdgpu_ras_block_hw_ops {
* 8: feature disable
*/
-
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);
@@ -630,7 +753,7 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
- struct eeprom_table_record *bps, int pages);
+ struct eeprom_table_record *bps, int pages, bool from_rom);
int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
unsigned long *new_cnt);
@@ -781,7 +904,8 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);
int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con);
int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
-bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable);
+bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
unsigned int *mode);
@@ -813,10 +937,44 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count);
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count);
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
- struct amdgpu_smuio_mcm_config_info *mcm_info,
- struct ras_err_addr *err_addr, u64 count);
-
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count);
+int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
+ struct amdgpu_smuio_mcm_config_info *mcm_info,
+ u64 count);
+void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances);
+int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,
+ const struct aca_info *aca_info, void *data);
+int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk);
+
+ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr,
+ struct aca_handle *handle, char *buf, void *data);
+
+void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
+bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
+void amdgpu_ras_set_err_poison(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block);
+void amdgpu_ras_clear_err_state(struct amdgpu_device *adev);
+bool amdgpu_ras_is_err_state(struct amdgpu_device *adev, int block);
+
+u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type);
+int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type,
+ const void *caller);
+
+int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn);
+
+int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block, uint16_t pasid,
+ pasid_notify pasid_fn, void *data, uint32_t reset);
+
+bool amdgpu_ras_in_recovery(struct amdgpu_device *adev);
+
+__printf(3, 4)
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+ const char *fmt, ...);
+
+bool amdgpu_ras_is_rma(struct amdgpu_device *adev);
#endif