diff options
Diffstat (limited to 'drivers/ras')
| -rw-r--r-- | drivers/ras/amd/atl/Kconfig | 1 | ||||
| -rw-r--r-- | drivers/ras/amd/atl/core.c | 7 | ||||
| -rw-r--r-- | drivers/ras/amd/atl/internal.h | 12 | ||||
| -rw-r--r-- | drivers/ras/amd/atl/prm.c | 4 | ||||
| -rw-r--r-- | drivers/ras/amd/atl/system.c | 30 | ||||
| -rw-r--r-- | drivers/ras/amd/atl/umc.c | 42 | ||||
| -rw-r--r-- | drivers/ras/amd/fmpm.c | 9 | ||||
| -rw-r--r-- | drivers/ras/cec.c | 2 | ||||
| -rw-r--r-- | drivers/ras/ras.c | 41 |
9 files changed, 109 insertions, 39 deletions
diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig index 551680073e43..6e03942cd7da 100644 --- a/drivers/ras/amd/atl/Kconfig +++ b/drivers/ras/amd/atl/Kconfig @@ -10,6 +10,7 @@ config AMD_ATL tristate "AMD Address Translation Library" depends on AMD_NB && X86_64 && RAS + depends on AMD_NODE depends on MEMORY_FAILURE default N help diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c index 4197e10993ac..0f7cd6dab0b0 100644 --- a/drivers/ras/amd/atl/core.c +++ b/drivers/ras/amd/atl/core.c @@ -194,6 +194,8 @@ MODULE_DEVICE_TABLE(x86cpu, amd_atl_cpuids); static int __init amd_atl_init(void) { + int ret; + if (!x86_match_cpu(amd_atl_cpuids)) return -ENODEV; @@ -202,8 +204,9 @@ static int __init amd_atl_init(void) check_for_legacy_df_access(); - if (get_df_system_info()) - return -ENODEV; + ret = get_df_system_info(); + if (ret) + return ret; /* Increment this module's recount so that it can't be easily unloaded. */ __module_get(THIS_MODULE); diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h index 143d04c779a8..82a56d9c2be1 100644 --- a/drivers/ras/amd/atl/internal.h +++ b/drivers/ras/amd/atl/internal.h @@ -17,7 +17,8 @@ #include <linux/bitops.h> #include <linux/ras.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> +#include <asm/amd/node.h> #include "reg_fields.h" @@ -137,7 +138,8 @@ struct df_flags { __u8 legacy_ficaa : 1, socket_id_shift_quirk : 1, heterogeneous : 1, - __reserved_0 : 5; + prm_only : 1, + __reserved_0 : 4; }; struct df_config { @@ -282,6 +284,9 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err); u64 add_base_and_hole(struct addr_ctx *ctx, u64 addr); u64 remove_base_and_hole(struct addr_ctx *ctx, u64 addr); +/* GUIDs for PRM handlers */ +extern const guid_t norm_to_sys_guid; + #ifdef CONFIG_AMD_ATL_PRM unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr); #else @@ -361,4 +366,7 @@ static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx) atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode); } +#define MI300_UMC_MCA_COL GENMASK(5, 1) +#define MI300_UMC_MCA_ROW13 BIT(23) + #endif /* __AMD_ATL_INTERNAL_H__ */ diff --git a/drivers/ras/amd/atl/prm.c b/drivers/ras/amd/atl/prm.c index 0931a20d213b..0f9bfa96e16a 100644 --- a/drivers/ras/amd/atl/prm.c +++ b/drivers/ras/amd/atl/prm.c @@ -29,10 +29,6 @@ struct norm_to_sys_param_buf { void *out_buf; } __packed; -static const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D, - 0x92, 0xCD, 0x2B, 0x56, 0xF1, - 0x2B, 0xEB, 0xA6); - unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long addr) { struct norm_to_sys_param_buf p_buf; diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c index e18d916d5e8b..812a30e21d3a 100644 --- a/drivers/ras/amd/atl/system.c +++ b/drivers/ras/amd/atl/system.c @@ -12,6 +12,12 @@ #include "internal.h" +#include <linux/prmt.h> + +const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D, + 0x92, 0xCD, 0x2B, 0x56, 0xF1, + 0x2B, 0xEB, 0xA6); + int determine_node_id(struct addr_ctx *ctx, u8 socket_id, u8 die_id) { u16 socket_id_bits, die_id_bits; @@ -212,15 +218,17 @@ static int determine_df_rev(void) if (!rev) return determine_df_rev_legacy(); - /* - * Fail out for major revisions other than '4'. - * - * Explicit support should be added for newer systems to avoid issues. - */ if (rev == 4) return df4_determine_df_rev(reg); - return -EINVAL; + /* All other systems should have PRM handlers. */ + if (!acpi_prm_handler_available(&norm_to_sys_guid)) { + pr_debug("PRM not available\n"); + return -ENODEV; + } + + df_cfg.flags.prm_only = true; + return 0; } static int get_dram_hole_base(void) @@ -288,12 +296,18 @@ static void dump_df_cfg(void) int get_df_system_info(void) { - if (determine_df_rev()) { + int ret; + + ret = determine_df_rev(); + if (ret) { pr_warn("Failed to determine DF Revision"); df_cfg.rev = UNKNOWN; - return -EINVAL; + return ret; } + if (df_cfg.flags.prm_only) + return 0; + apply_node_id_shift(); get_num_maps(); diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index dc8aa12f63c8..befc616d5e8a 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -49,17 +49,6 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err) return i; } -/* XOR the bits in @val. */ -static u16 bitwise_xor_bits(u16 val) -{ - u16 tmp = 0; - u8 i; - - for (i = 0; i < 16; i++) - tmp ^= (val >> i) & 0x1; - - return tmp; -} struct xor_bits { bool xor_enable; @@ -229,7 +218,6 @@ int get_umc_info_mi300(void) * Additionally, the PC and Bank bits may be hashed. This must be accounted for before * reconstructing the normalized address. */ -#define MI300_UMC_MCA_COL GENMASK(5, 1) #define MI300_UMC_MCA_BANK GENMASK(9, 6) #define MI300_UMC_MCA_ROW GENMASK(24, 10) #define MI300_UMC_MCA_PC BIT(25) @@ -251,17 +239,17 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) if (!addr_hash.bank[i].xor_enable) continue; - temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor); - temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor); + temp = hweight16(col & addr_hash.bank[i].col_xor) & 1; + temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1; bank ^= temp << i; } /* Calculate hash for PC bit. */ if (addr_hash.pc.xor_enable) { - temp = bitwise_xor_bits(col & addr_hash.pc.col_xor); - temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor); + temp = hweight16(col & addr_hash.pc.col_xor) & 1; + temp ^= hweight16(row & addr_hash.pc.row_xor) & 1; /* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */ - temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor); + temp ^= hweight16((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor) & 1; pc ^= temp; } @@ -320,7 +308,7 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats. */ #define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) -static void retire_row_mi300(struct atl_err *a_err) +static void _retire_row_mi300(struct atl_err *a_err) { unsigned long addr; struct page *p; @@ -351,6 +339,22 @@ static void retire_row_mi300(struct atl_err *a_err) } } +/* + * In addition to the column bits, the row[13] bit should also be included when + * calculating addresses affected by a physical row. + * + * Instead of running through another loop over a single bit, just run through + * the column bits twice and flip the row[13] bit in-between. + * + * See MI300_UMC_MCA_ROW for the row bits in MCA_ADDR_UMC value. + */ +static void retire_row_mi300(struct atl_err *a_err) +{ + _retire_row_mi300(a_err); + a_err->addr ^= MI300_UMC_MCA_ROW13; + _retire_row_mi300(a_err); +} + void amd_retire_dram_row(struct atl_err *a_err) { if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) @@ -407,7 +411,7 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err) socket_id, die_id, coh_st_inst_id, addr); ret_addr = prm_umc_norm_to_sys_addr(socket_id, err->ipid, addr); - if (!IS_ERR_VALUE(ret_addr)) + if (!IS_ERR_VALUE(ret_addr) || df_cfg.flags.prm_only) return ret_addr; return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr); diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c index 90de737fbc90..8877c6ff64c4 100644 --- a/drivers/ras/amd/fmpm.c +++ b/drivers/ras/amd/fmpm.c @@ -250,6 +250,13 @@ static bool rec_has_valid_entries(struct fru_rec *rec) return true; } +/* + * Row retirement is done on MI300 systems, and some bits are 'don't + * care' for comparing addresses with unique physical rows. This + * includes all column bits and the row[13] bit. + */ +#define MASK_ADDR(addr) ((addr) & ~(MI300_UMC_MCA_ROW13 | MI300_UMC_MCA_COL)) + static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new) { /* @@ -258,7 +265,7 @@ static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_ * * Also, order the checks from most->least likely to fail to shortcut the code. */ - if (old->addr != new->addr) + if (MASK_ADDR(old->addr) != MASK_ADDR(new->addr)) return false; if (old->hw_id != new->hw_id) diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index e440b15fbabc..15f7f043c8ef 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -166,7 +166,7 @@ static void cec_mod_work(unsigned long interval) unsigned long iv; iv = interval * HZ; - mod_delayed_work(system_wq, &cec_work, round_jiffies(iv)); + mod_delayed_work(system_percpu_wq, &cec_work, round_jiffies(iv)); } static void cec_work_fn(struct work_struct *work) diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index a6e4792a1b2e..2a5b5a9fdcb3 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -51,10 +51,47 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, { trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len); } +EXPORT_SYMBOL_GPL(log_non_standard_event); -void log_arm_hw_error(struct cper_sec_proc_arm *err) +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { - trace_arm_event(err); + struct cper_arm_err_info *err_info; + struct cper_arm_ctx_info *ctx_info; + u8 *ven_err_data; + u32 ctx_len = 0; + int n, sz, cpu; + s32 vsei_len; + u32 pei_len; + u8 *pei_err, *ctx_err; + + pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num; + pei_err = (u8 *)(err + 1); + + err_info = (struct cper_arm_err_info *)(err + 1); + ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num); + ctx_err = (u8 *)ctx_info; + + for (n = 0; n < err->context_info_num; n++) { + sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size; + ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz); + ctx_len += sz; + } + + vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + pei_len + ctx_len); + if (vsei_len < 0) { + pr_warn(FW_BUG "section length: %d\n", err->section_length); + pr_warn(FW_BUG "section length is too small\n"); + pr_warn(FW_BUG "firmware-generated error record is incorrect\n"); + vsei_len = 0; + } + ven_err_data = (u8 *)ctx_info; + + cpu = GET_LOGICAL_INDEX(err->mpidr); + if (cpu < 0) + cpu = -1; + + trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len, + ven_err_data, (u32)vsei_len, sev, cpu); } static int __init ras_init(void) |
