summary | refs | log | tree | commit | diff
path: root/drivers/ras
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/ras')
-rw-r--r-- drivers/ras/amd/atl/Kconfig 1
-rw-r--r-- drivers/ras/amd/atl/core.c 7
-rw-r--r-- drivers/ras/amd/atl/internal.h 12
-rw-r--r-- drivers/ras/amd/atl/prm.c 4
-rw-r--r-- drivers/ras/amd/atl/system.c 30
-rw-r--r-- drivers/ras/amd/atl/umc.c 42
-rw-r--r-- drivers/ras/amd/fmpm.c 9
-rw-r--r-- drivers/ras/cec.c 2
-rw-r--r-- drivers/ras/ras.c 41
9 files changed, 109 insertions, 39 deletions
diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig
index 551680073e43..6e03942cd7da 100644
--- a/drivers/ras/amd/atl/Kconfig
+++ b/drivers/ras/amd/atl/Kconfig
@@ -10,6 +10,7 @@
config AMD_ATL
tristate "AMD Address Translation Library"
depends on AMD_NB && X86_64 && RAS
+ depends on AMD_NODE
depends on MEMORY_FAILURE
default N
help
diff --git a/drivers/ras/amd/atl/core.c b/drivers/ras/amd/atl/core.c
index 4197e10993ac..0f7cd6dab0b0 100644
--- a/drivers/ras/amd/atl/core.c
+++ b/drivers/ras/amd/atl/core.c
@@ -194,6 +194,8 @@ MODULE_DEVICE_TABLE(x86cpu, amd_atl_cpuids);
static int __init amd_atl_init(void)
{
+ int ret;
+
if (!x86_match_cpu(amd_atl_cpuids))
return -ENODEV;
@@ -202,8 +204,9 @@ static int __init amd_atl_init(void)
check_for_legacy_df_access();
- if (get_df_system_info())
- return -ENODEV;
+ ret = get_df_system_info();
+ if (ret)
+ return ret;
/* Increment this module's recount so that it can't be easily unloaded. */
__module_get(THIS_MODULE);
diff --git a/drivers/ras/amd/atl/internal.h b/drivers/ras/amd/atl/internal.h
index 143d04c779a8..82a56d9c2be1 100644
--- a/drivers/ras/amd/atl/internal.h
+++ b/drivers/ras/amd/atl/internal.h
@@ -17,7 +17,8 @@
#include <linux/bitops.h>
#include <linux/ras.h>
-#include <asm/amd_nb.h>
+#include <asm/amd/nb.h>
+#include <asm/amd/node.h>
#include "reg_fields.h"
@@ -137,7 +138,8 @@ struct df_flags {
__u8 legacy_ficaa : 1,
socket_id_shift_quirk : 1,
heterogeneous : 1,
- __reserved_0 : 5;
+ prm_only : 1,
+ __reserved_0 : 4;
};
struct df_config {
@@ -282,6 +284,9 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
u64 add_base_and_hole(struct addr_ctx *ctx, u64 addr);
u64 remove_base_and_hole(struct addr_ctx *ctx, u64 addr);
+/* GUIDs for PRM handlers */
+extern const guid_t norm_to_sys_guid;
+
#ifdef CONFIG_AMD_ATL_PRM
unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 umc_bank_inst_id, unsigned long addr);
#else
@@ -361,4 +366,7 @@ static inline void atl_debug_on_bad_intlv_mode(struct addr_ctx *ctx)
atl_debug(ctx, "Unrecognized interleave mode: %u", ctx->map.intlv_mode);
}
+#define MI300_UMC_MCA_COL GENMASK(5, 1)
+#define MI300_UMC_MCA_ROW13 BIT(23)
+
#endif /* __AMD_ATL_INTERNAL_H__ */
diff --git a/drivers/ras/amd/atl/prm.c b/drivers/ras/amd/atl/prm.c
index 0931a20d213b..0f9bfa96e16a 100644
--- a/drivers/ras/amd/atl/prm.c
+++ b/drivers/ras/amd/atl/prm.c
@@ -29,10 +29,6 @@ struct norm_to_sys_param_buf {
void *out_buf;
} __packed;
-static const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D,
- 0x92, 0xCD, 0x2B, 0x56, 0xF1,
- 0x2B, 0xEB, 0xA6);
-
unsigned long prm_umc_norm_to_sys_addr(u8 socket_id, u64 bank_id, unsigned long addr)
{
struct norm_to_sys_param_buf p_buf;
diff --git a/drivers/ras/amd/atl/system.c b/drivers/ras/amd/atl/system.c
index e18d916d5e8b..812a30e21d3a 100644
--- a/drivers/ras/amd/atl/system.c
+++ b/drivers/ras/amd/atl/system.c
@@ -12,6 +12,12 @@
#include "internal.h"
+#include <linux/prmt.h>
+
+const guid_t norm_to_sys_guid = GUID_INIT(0xE7180659, 0xA65D, 0x451D,
+ 0x92, 0xCD, 0x2B, 0x56, 0xF1,
+ 0x2B, 0xEB, 0xA6);
+
int determine_node_id(struct addr_ctx *ctx, u8 socket_id, u8 die_id)
{
u16 socket_id_bits, die_id_bits;
@@ -212,15 +218,17 @@ static int determine_df_rev(void)
if (!rev)
return determine_df_rev_legacy();
- /*
- * Fail out for major revisions other than '4'.
- *
- * Explicit support should be added for newer systems to avoid issues.
- */
if (rev == 4)
return df4_determine_df_rev(reg);
- return -EINVAL;
+ /* All other systems should have PRM handlers. */
+ if (!acpi_prm_handler_available(&norm_to_sys_guid)) {
+ pr_debug("PRM not available\n");
+ return -ENODEV;
+ }
+
+ df_cfg.flags.prm_only = true;
+ return 0;
}
static int get_dram_hole_base(void)
@@ -288,12 +296,18 @@ static void dump_df_cfg(void)
int get_df_system_info(void)
{
- if (determine_df_rev()) {
+ int ret;
+
+ ret = determine_df_rev();
+ if (ret) {
pr_warn("Failed to determine DF Revision");
df_cfg.rev = UNKNOWN;
- return -EINVAL;
+ return ret;
}
+ if (df_cfg.flags.prm_only)
+ return 0;
+
apply_node_id_shift();
get_num_maps();
diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c
index dc8aa12f63c8..befc616d5e8a 100644
--- a/drivers/ras/amd/atl/umc.c
+++ b/drivers/ras/amd/atl/umc.c
@@ -49,17 +49,6 @@ static u8 get_coh_st_inst_id_mi300(struct atl_err *err)
return i;
}
-/* XOR the bits in @val. */
-static u16 bitwise_xor_bits(u16 val)
-{
- u16 tmp = 0;
- u8 i;
-
- for (i = 0; i < 16; i++)
- tmp ^= (val >> i) & 0x1;
-
- return tmp;
-}
struct xor_bits {
bool xor_enable;
@@ -229,7 +218,6 @@ int get_umc_info_mi300(void)
* Additionally, the PC and Bank bits may be hashed. This must be accounted for before
* reconstructing the normalized address.
*/
-#define MI300_UMC_MCA_COL GENMASK(5, 1)
#define MI300_UMC_MCA_BANK GENMASK(9, 6)
#define MI300_UMC_MCA_ROW GENMASK(24, 10)
#define MI300_UMC_MCA_PC BIT(25)
@@ -251,17 +239,17 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
if (!addr_hash.bank[i].xor_enable)
continue;
- temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
- temp ^= bitwise_xor_bits(row & addr_hash.bank[i].row_xor);
+ temp = hweight16(col & addr_hash.bank[i].col_xor) & 1;
+ temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1;
bank ^= temp << i;
}
/* Calculate hash for PC bit. */
if (addr_hash.pc.xor_enable) {
- temp = bitwise_xor_bits(col & addr_hash.pc.col_xor);
- temp ^= bitwise_xor_bits(row & addr_hash.pc.row_xor);
+ temp = hweight16(col & addr_hash.pc.col_xor) & 1;
+ temp ^= hweight16(row & addr_hash.pc.row_xor) & 1;
/* Bits SID[1:0] act as Bank[5:4] for PC hash, so apply them here. */
- temp ^= bitwise_xor_bits((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor);
+ temp ^= hweight16((bank | sid << NUM_BANK_BITS) & addr_hash.bank_xor) & 1;
pc ^= temp;
}
@@ -320,7 +308,7 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
* See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
*/
#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL))
-static void retire_row_mi300(struct atl_err *a_err)
+static void _retire_row_mi300(struct atl_err *a_err)
{
unsigned long addr;
struct page *p;
@@ -351,6 +339,22 @@ static void retire_row_mi300(struct atl_err *a_err)
}
}
+/*
+ * In addition to the column bits, the row[13] bit should also be included when
+ * calculating addresses affected by a physical row.
+ *
+ * Instead of running through another loop over a single bit, just run through
+ * the column bits twice and flip the row[13] bit in-between.
+ *
+ * See MI300_UMC_MCA_ROW for the row bits in MCA_ADDR_UMC value.
+ */
+static void retire_row_mi300(struct atl_err *a_err)
+{
+ _retire_row_mi300(a_err);
+ a_err->addr ^= MI300_UMC_MCA_ROW13;
+ _retire_row_mi300(a_err);
+}
+
void amd_retire_dram_row(struct atl_err *a_err)
{
if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
@@ -407,7 +411,7 @@ unsigned long convert_umc_mca_addr_to_sys_addr(struct atl_err *err)
socket_id, die_id, coh_st_inst_id, addr);
ret_addr = prm_umc_norm_to_sys_addr(socket_id, err->ipid, addr);
- if (!IS_ERR_VALUE(ret_addr))
+ if (!IS_ERR_VALUE(ret_addr) || df_cfg.flags.prm_only)
return ret_addr;
return norm_to_sys_addr(socket_id, die_id, coh_st_inst_id, addr);
diff --git a/drivers/ras/amd/fmpm.c b/drivers/ras/amd/fmpm.c
index 90de737fbc90..8877c6ff64c4 100644
--- a/drivers/ras/amd/fmpm.c
+++ b/drivers/ras/amd/fmpm.c
@@ -250,6 +250,13 @@ static bool rec_has_valid_entries(struct fru_rec *rec)
return true;
}
+/*
+ * Row retirement is done on MI300 systems, and some bits are 'don't
+ * care' for comparing addresses with unique physical rows. This
+ * includes all column bits and the row[13] bit.
+ */
+#define MASK_ADDR(addr) ((addr) & ~(MI300_UMC_MCA_ROW13 | MI300_UMC_MCA_COL))
+
static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_desc *new)
{
/*
@@ -258,7 +265,7 @@ static bool fpds_equal(struct cper_fru_poison_desc *old, struct cper_fru_poison_
*
* Also, order the checks from most->least likely to fail to shortcut the code.
*/
- if (old->addr != new->addr)
+ if (MASK_ADDR(old->addr) != MASK_ADDR(new->addr))
return false;
if (old->hw_id != new->hw_id)
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index e440b15fbabc..15f7f043c8ef 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -166,7 +166,7 @@ static void cec_mod_work(unsigned long interval)
unsigned long iv;
iv = interval * HZ;
- mod_delayed_work(system_wq, &cec_work, round_jiffies(iv));
+ mod_delayed_work(system_percpu_wq, &cec_work, round_jiffies(iv));
}
static void cec_work_fn(struct work_struct *work)
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index a6e4792a1b2e..2a5b5a9fdcb3 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -51,10 +51,47 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
{
trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
}
+EXPORT_SYMBOL_GPL(log_non_standard_event);
-void log_arm_hw_error(struct cper_sec_proc_arm *err)
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
{
- trace_arm_event(err);
+ struct cper_arm_err_info *err_info;
+ struct cper_arm_ctx_info *ctx_info;
+ u8 *ven_err_data;
+ u32 ctx_len = 0;
+ int n, sz, cpu;
+ s32 vsei_len;
+ u32 pei_len;
+ u8 *pei_err, *ctx_err;
+
+ pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
+ pei_err = (u8 *)(err + 1);
+
+ err_info = (struct cper_arm_err_info *)(err + 1);
+ ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
+ ctx_err = (u8 *)ctx_info;
+
+ for (n = 0; n < err->context_info_num; n++) {
+ sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
+ ctx_len += sz;
+ }
+
+ vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) + pei_len + ctx_len);
+ if (vsei_len < 0) {
+ pr_warn(FW_BUG "section length: %d\n", err->section_length);
+ pr_warn(FW_BUG "section length is too small\n");
+ pr_warn(FW_BUG "firmware-generated error record is incorrect\n");
+ vsei_len = 0;
+ }
+ ven_err_data = (u8 *)ctx_info;
+
+ cpu = GET_LOGICAL_INDEX(err->mpidr);
+ if (cpu < 0)
+ cpu = -1;
+
+ trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
+ ven_err_data, (u32)vsei_len, sev, cpu);
}
static int __init ras_init(void)