diff options
| -rw-r--r-- | arch/x86/include/asm/acpi.h | 11 | ||||
| -rw-r--r-- | arch/x86/include/asm/mce.h | 9 | ||||
| -rw-r--r-- | arch/x86/include/asm/msr-index.h | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/acpi/apei.c | 5 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mce/apei.c | 61 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 43 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/mce/intel.c | 21 | ||||
| -rw-r--r-- | drivers/firmware/efi/cper-x86.c | 11 | 
8 files changed, 131 insertions, 31 deletions
| diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 6d2df1ee427b..65064d9f7fa6 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -159,6 +159,8 @@ static inline u64 x86_default_get_root_pointer(void)  extern int x86_acpi_numa_init(void);  #endif /* CONFIG_ACPI_NUMA */ +struct cper_ia_proc_ctx; +  #ifdef CONFIG_ACPI_APEI  static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)  { @@ -177,6 +179,15 @@ static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr)  	 */  	return PAGE_KERNEL_NOENC;  } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, +			       u64 lapic_id); +#else +static inline int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, +					     u64 lapic_id) +{ +	return -EINVAL; +}  #endif  #define ACPI_TABLE_UPGRADE_MAX_PHYS (max_low_pfn_mapped << PAGE_SHIFT) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a0f147893a04..56cdeaac76a0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -177,7 +177,8 @@ enum mce_notifier_prios {  	MCE_PRIO_EXTLOG,  	MCE_PRIO_UC,  	MCE_PRIO_EARLY, -	MCE_PRIO_CEC +	MCE_PRIO_CEC, +	MCE_PRIO_HIGHEST = MCE_PRIO_CEC  };  struct notifier_block; @@ -198,16 +199,22 @@ static inline void enable_copy_mc_fragile(void)  }  #endif +struct cper_ia_proc_ctx; +  #ifdef CONFIG_X86_MCE  int mcheck_init(void);  void mcheck_cpu_init(struct cpuinfo_x86 *c);  void mcheck_cpu_clear(struct cpuinfo_x86 *c);  void mcheck_vendor_init_severity(void); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, +			       u64 lapic_id);  #else  static inline int mcheck_init(void) { return 0; }  static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}  static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}  static inline void mcheck_vendor_init_severity(void) {} +static inline int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, +					     u64 lapic_id) { return -EINVAL; }  #endif  #ifdef CONFIG_X86_ANCIENT_MCE diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 972a34d93505..b2dd2648c0e2 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -139,6 +139,7 @@  #define MSR_IA32_MCG_CAP		0x00000179  #define MSR_IA32_MCG_STATUS		0x0000017a  #define MSR_IA32_MCG_CTL		0x0000017b +#define MSR_ERROR_CONTROL		0x0000017f  #define MSR_IA32_MCG_EXT_CTL		0x000004d0  #define MSR_OFFCORE_RSP_0		0x000001a6 diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c22fb55abcfd..0916f00a992e 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)  	apei_mce_report_mem_error(sev, mem_err);  #endif  } + +int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ +	return apei_smca_report_x86_error(ctx_info, lapic_id); +} diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index af8d37962586..b58b85380ddb 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -51,6 +51,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)  }  EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); +int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) +{ +	const u64 *i_mce = ((const u64 *) (ctx_info + 1)); +	unsigned int cpu; +	struct mce m; + +	if (!boot_cpu_has(X86_FEATURE_SMCA)) +		return -EINVAL; + +	/* +	 * The starting address of the register array extracted from BERT must +	 * match with the first expected register in the register layout of +	 * SMCA address space. This address corresponds to banks's MCA_STATUS +	 * register. +	 * +	 * Match any MCi_STATUS register by turning off bank numbers. +	 */ +	if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) != +				  MSR_AMD64_SMCA_MC0_STATUS) +		return -EINVAL; + +	/* +	 * The register array size must be large enough to include all the +	 * SMCA registers which need to be extracted. +	 * +	 * The number of registers in the register array is determined by +	 * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2. +	 * The register layout is fixed and currently the raw data in the +	 * register array includes 6 SMCA registers which the kernel can +	 * extract. +	 */ +	if (ctx_info->reg_arr_size < 48) +		return -EINVAL; + +	mce_setup(&m); + +	m.extcpu = -1; +	m.socketid = -1; + +	for_each_possible_cpu(cpu) { +		if (cpu_data(cpu).initial_apicid == lapic_id) { +			m.extcpu = cpu; +			m.socketid = cpu_data(m.extcpu).phys_proc_id; +			break; +		} +	} + +	m.apicid = lapic_id; +	m.bank = (ctx_info->msr_addr >> 4) & 0xFF; +	m.status = *i_mce; +	m.addr = *(i_mce + 1); +	m.misc = *(i_mce + 2); +	/* Skipping MCA_CONFIG */ +	m.ipid = *(i_mce + 4); +	m.synd = *(i_mce + 5); + +	mce_log(&m); + +	return 0; +} +  #define CPER_CREATOR_MCE						\  	GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,	\  		  0x64, 0x90, 0xb8, 0x9d) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 32b7099e3511..6af6a3c0698f 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -162,7 +162,8 @@ EXPORT_SYMBOL_GPL(mce_log);  void mce_register_decode_chain(struct notifier_block *nb)  { -	if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC)) +	if (WARN_ON(nb->priority < MCE_PRIO_LOWEST || +		    nb->priority > MCE_PRIO_HIGHEST))  		return;  	blocking_notifier_chain_register(&x86_mce_decoder_chain, nb); @@ -1265,14 +1266,14 @@ static void kill_me_maybe(struct callback_head *cb)  	}  } -static void queue_task_work(struct mce *m, int kill_it) +static void queue_task_work(struct mce *m, int kill_current_task)  {  	current->mce_addr = m->addr;  	current->mce_kflags = m->kflags;  	current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);  	current->mce_whole_page = whole_page(m); -	if (kill_it) +	if (kill_current_task)  		current->mce_kill_me.func = kill_me_now;  	else  		current->mce_kill_me.func = kill_me_maybe; @@ -1320,10 +1321,10 @@ noinstr void do_machine_check(struct pt_regs *regs)  	int no_way_out = 0;  	/* -	 * If kill_it gets set, there might be a way to recover from this +	 * If kill_current_task is not set, there might be a way to recover from this  	 * error.  	 */ -	int kill_it = 0; +	int kill_current_task = 0;  	/*  	 * MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES @@ -1350,8 +1351,7 @@ noinstr void do_machine_check(struct pt_regs *regs)  	 * severity is MCE_AR_SEVERITY we have other options.  	 */  	if (!(m.mcgstatus & MCG_STATUS_RIPV)) -		kill_it = 1; - +		kill_current_task = (cfg->tolerant == 3) ? 0 : 1;  	/*  	 * Check if this MCE is signaled to only this logical processor,  	 * on Intel, Zhaoxin only. @@ -1368,7 +1368,7 @@ noinstr void do_machine_check(struct pt_regs *regs)  	 * to see it will clear it.  	 */  	if (lmce) { -		if (no_way_out) +		if (no_way_out && cfg->tolerant < 3)  			mce_panic("Fatal local machine check", &m, msg);  	} else {  		order = mce_start(&no_way_out); @@ -1387,6 +1387,9 @@ noinstr void do_machine_check(struct pt_regs *regs)  		if (mce_end(order) < 0) {  			if (!no_way_out)  				no_way_out = worst >= MCE_PANIC_SEVERITY; + +			if (no_way_out && cfg->tolerant < 3) +				mce_panic("Fatal machine check on current CPU", &m, msg);  		}  	} else {  		/* @@ -1403,19 +1406,7 @@ noinstr void do_machine_check(struct pt_regs *regs)  		}  	} -	/* -	 * If tolerant is at an insane level we drop requests to kill -	 * processes and continue even when there is no way out. -	 */ -	if (cfg->tolerant == 3) -		kill_it = 0; -	else if (no_way_out) -		mce_panic("Fatal machine check on current CPU", &m, msg); - -	if (worst > 0) -		irq_work_queue(&mce_irq_work); - -	if (worst != MCE_AR_SEVERITY && !kill_it) +	if (worst != MCE_AR_SEVERITY && !kill_current_task)  		goto out;  	/* Fault was in user mode and we need to take some action */ @@ -1423,7 +1414,7 @@ noinstr void do_machine_check(struct pt_regs *regs)  		/* If this triggers there is no way to recover. Die hard. */  		BUG_ON(!on_thread_stack() || !user_mode(regs)); -		queue_task_work(&m, kill_it); +		queue_task_work(&m, kill_current_task);  	} else {  		/* @@ -1441,7 +1432,7 @@ noinstr void do_machine_check(struct pt_regs *regs)  		}  		if (m.kflags & MCE_IN_KERNEL_COPYIN) -			queue_task_work(&m, kill_it); +			queue_task_work(&m, kill_current_task);  	}  out:  	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); @@ -1583,7 +1574,7 @@ static void __mcheck_cpu_mce_banks_init(void)  		 * __mcheck_cpu_init_clear_banks() does the final bank setup.  		 */  		b->ctl = -1ULL; -		b->init = 1; +		b->init = true;  	}  } @@ -1764,7 +1755,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)  		 */  		if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) -			mce_banks[0].init = 0; +			mce_banks[0].init = false;  		/*  		 * All newer Intel systems support MCE broadcasting. Enable @@ -1813,11 +1804,9 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)  	case X86_VENDOR_INTEL:  		intel_p5_mcheck_init(c);  		return 1; -		break;  	case X86_VENDOR_CENTAUR:  		winchip_mcheck_init(c);  		return 1; -		break;  	default:  		return 0;  	} diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index abe9fe0fb851..c2476fe0682e 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -509,12 +509,33 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)  	}  } +/* + * Enable additional error logs from the integrated + * memory controller on processors that support this. + */ +static void intel_imc_init(struct cpuinfo_x86 *c) +{ +	u64 error_control; + +	switch (c->x86_model) { +	case INTEL_FAM6_SANDYBRIDGE_X: +	case INTEL_FAM6_IVYBRIDGE_X: +	case INTEL_FAM6_HASWELL_X: +		if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) +			return; +		error_control |= 2; +		wrmsrl_safe(MSR_ERROR_CONTROL, error_control); +		break; +	} +} +  void mce_intel_feature_init(struct cpuinfo_x86 *c)  {  	intel_init_thermal(c);  	intel_init_cmci();  	intel_init_lmce();  	intel_ppin_init(c); +	intel_imc_init(c);  }  void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/drivers/firmware/efi/cper-x86.c b/drivers/firmware/efi/cper-x86.c index 2531de49f56c..438ed9eff6d0 100644 --- a/drivers/firmware/efi/cper-x86.c +++ b/drivers/firmware/efi/cper-x86.c @@ -2,6 +2,7 @@  // Copyright (C) 2018, Advanced Micro Devices, Inc.  #include <linux/cper.h> +#include <linux/acpi.h>  /*   * We don't need a "CPER_IA" prefix since these are all locally defined. @@ -347,9 +348,13 @@ void cper_print_proc_ia(const char *pfx, const struct cper_sec_proc_ia *proc)  			       ctx_info->mm_reg_addr);  		} -		printk("%sRegister Array:\n", newpfx); -		print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, groupsize, -			       (ctx_info + 1), ctx_info->reg_arr_size, 0); +		if (ctx_info->reg_ctx_type != CTX_TYPE_MSR || +		    arch_apei_report_x86_error(ctx_info, proc->lapic_id)) { +			printk("%sRegister Array:\n", newpfx); +			print_hex_dump(newpfx, "", DUMP_PREFIX_OFFSET, 16, +				       groupsize, (ctx_info + 1), +				       ctx_info->reg_arr_size, 0); +		}  		ctx_info = (struct cper_ia_proc_ctx *)((long)ctx_info + size);  	} | 
