diff options
Diffstat (limited to 'drivers/edac/ghes_edac.c')
| -rw-r--r-- | drivers/edac/ghes_edac.c | 771 |
1 files changed, 404 insertions, 367 deletions
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index bb534670ec02..d80c88818691 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * GHES/EDAC Linux driver * - * This file may be distributed under the terms of the GNU General Public - * License version 2. + * Copyright (c) 2013 by Mauro Carvalho Chehab * - * Copyright (c) 2013 by Mauro Carvalho Chehab <mchehab@redhat.com> - * - * Red Hat Inc. http://www.redhat.com + * Red Hat Inc. https://www.redhat.com */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -14,26 +12,52 @@ #include <acpi/ghes.h> #include <linux/edac.h> #include <linux/dmi.h> -#include "edac_core.h" +#include "edac_module.h" #include <ras/ras_event.h> +#include <linux/notifier.h> +#include <linux/string.h> -#define GHES_EDAC_REVISION " Ver: 1.0.0" +#define OTHER_DETAIL_LEN 400 -struct ghes_edac_pvt { - struct list_head list; - struct ghes *ghes; +struct ghes_pvt { struct mem_ctl_info *mci; /* Buffers for the error handling routine */ - char detail_location[240]; - char other_detail[160]; + char other_detail[OTHER_DETAIL_LEN]; char msg[80]; }; -static LIST_HEAD(ghes_reglist); -static DEFINE_MUTEX(ghes_edac_lock); -static int ghes_edac_mc_num; +static refcount_t ghes_refcount = REFCOUNT_INIT(0); + +/* + * Access to ghes_pvt must be protected by ghes_lock. The spinlock + * also provides the necessary (implicit) memory barrier for the SMP + * case to make the pointer visible on another CPU. + */ +static struct ghes_pvt *ghes_pvt; + +/* + * This driver's representation of the system hardware, as collected + * from DMI. + */ +static struct ghes_hw_desc { + int num_dimms; + struct dimm_info *dimms; +} ghes_hw; + +/* GHES registration mutex */ +static DEFINE_MUTEX(ghes_reg_mutex); + +/* + * Sync with other, potentially concurrent callers of + * ghes_edac_report_mem_error(). We don't know what the + * "inventive" firmware would do. + */ +static DEFINE_SPINLOCK(ghes_lock); + +static bool system_scanned; +static struct list_head *ghes_devs; /* Memory Device - Type 17 of SMBIOS spec */ struct memdev_dmi_entry { @@ -61,155 +85,221 @@ struct memdev_dmi_entry { u16 conf_mem_clk_speed; } __attribute__((__packed__)); -struct ghes_edac_dimm_fill { - struct mem_ctl_info *mci; - unsigned count; -}; +static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle) +{ + struct dimm_info *dimm; -char *memory_type[] = { - [MEM_EMPTY] = "EMPTY", - [MEM_RESERVED] = "RESERVED", - [MEM_UNKNOWN] = "UNKNOWN", - [MEM_FPM] = "FPM", - [MEM_EDO] = "EDO", - [MEM_BEDO] = "BEDO", - [MEM_SDR] = "SDR", - [MEM_RDR] = "RDR", - [MEM_DDR] = "DDR", - [MEM_RDDR] = "RDDR", - [MEM_RMBS] = "RMBS", - [MEM_DDR2] = "DDR2", - [MEM_FB_DDR2] = "FB_DDR2", - [MEM_RDDR2] = "RDDR2", - [MEM_XDR] = "XDR", - [MEM_DDR3] = "DDR3", - [MEM_RDDR3] = "RDDR3", -}; + mci_for_each_dimm(mci, dimm) { + if (dimm->smbios_handle == handle) + return dimm; + } + + return NULL; +} -static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) +static void dimm_setup_label(struct dimm_info *dimm, u16 handle) { - int *num_dimm = arg; + const char *bank = NULL, *device = NULL; + + dmi_memdev_name(handle, &bank, &device); - if (dh->type == DMI_ENTRY_MEM_DEVICE) - (*num_dimm)++; + /* + * Set to a NULL string when both bank and device are zero. In this case, + * the label assigned by default will be preserved. + */ + snprintf(dimm->label, sizeof(dimm->label), "%s%s%s", + (bank && *bank) ? bank : "", + (bank && *bank && device && *device) ? " " : "", + (device && *device) ? device : ""); } -static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) +static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry) { - struct ghes_edac_dimm_fill *dimm_fill = arg; - struct mem_ctl_info *mci = dimm_fill->mci; - - if (dh->type == DMI_ENTRY_MEM_DEVICE) { - struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, - dimm_fill->count, 0, 0); - - if (entry->size == 0xffff) { - pr_info("Can't get DIMM%i size\n", - dimm_fill->count); - dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ - } else if (entry->size == 0x7fff) { - dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); - } else { - if (entry->size & 1 << 15) - dimm->nr_pages = MiB_TO_PAGES((entry->size & - 0x7fff) << 10); - else - dimm->nr_pages = MiB_TO_PAGES(entry->size); - } + u16 rdr_mask = BIT(7) | BIT(13); - switch (entry->memory_type) { - case 0x12: - if (entry->type_detail & 1 << 13) - dimm->mtype = MEM_RDDR; - else - dimm->mtype = MEM_DDR; - break; - case 0x13: - if (entry->type_detail & 1 << 13) - dimm->mtype = MEM_RDDR2; - else - dimm->mtype = MEM_DDR2; - break; - case 0x14: - dimm->mtype = MEM_FB_DDR2; - break; - case 0x18: - if (entry->type_detail & 1 << 13) - dimm->mtype = MEM_RDDR3; - else - dimm->mtype = MEM_DDR3; - break; - default: - if (entry->type_detail & 1 << 6) - dimm->mtype = MEM_RMBS; - else if ((entry->type_detail & ((1 << 7) | (1 << 13))) - == ((1 << 7) | (1 << 13))) - dimm->mtype = MEM_RDR; - else if (entry->type_detail & 1 << 7) - dimm->mtype = MEM_SDR; - else if (entry->type_detail & 1 << 9) - dimm->mtype = MEM_EDO; - else - dimm->mtype = MEM_UNKNOWN; - } + if (entry->size == 0xffff) { + pr_info("Can't get DIMM%i size\n", dimm->idx); + dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */ + } else if (entry->size == 0x7fff) { + dimm->nr_pages = MiB_TO_PAGES(entry->extended_size); + } else { + if (entry->size & BIT(15)) + dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10); + else + dimm->nr_pages = MiB_TO_PAGES(entry->size); + } - /* - * Actually, we can only detect if the memory has bits for - * checksum or not - */ - if (entry->total_width == entry->data_width) - dimm->edac_mode = EDAC_NONE; + switch (entry->memory_type) { + case 0x12: + if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR; else - dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR; + break; + case 0x13: + if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR2; + else + dimm->mtype = MEM_DDR2; + break; + case 0x14: + dimm->mtype = MEM_FB_DDR2; + break; + case 0x18: + if (entry->type_detail & BIT(12)) + dimm->mtype = MEM_NVDIMM; + else if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR3; + else + dimm->mtype = MEM_DDR3; + break; + case 0x1a: + if (entry->type_detail & BIT(12)) + dimm->mtype = MEM_NVDIMM; + else if (entry->type_detail & BIT(13)) + dimm->mtype = MEM_RDDR4; + else + dimm->mtype = MEM_DDR4; + break; + default: + if (entry->type_detail & BIT(6)) + dimm->mtype = MEM_RMBS; + else if ((entry->type_detail & rdr_mask) == rdr_mask) + dimm->mtype = MEM_RDR; + else if (entry->type_detail & BIT(7)) + dimm->mtype = MEM_SDR; + else if (entry->type_detail & BIT(9)) + dimm->mtype = MEM_EDO; + else + dimm->mtype = MEM_UNKNOWN; + } - dimm->dtype = DEV_UNKNOWN; - dimm->grain = 128; /* Likely, worse case */ - - /* - * FIXME: It shouldn't be hard to also fill the DIMM labels - */ - - if (dimm->nr_pages) { - edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", - dimm_fill->count, memory_type[dimm->mtype], - PAGES_TO_MiB(dimm->nr_pages), - (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); - edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", - entry->memory_type, entry->type_detail, - entry->total_width, entry->data_width); + /* + * Actually, we can only detect if the memory has bits for + * checksum or not + */ + if (entry->total_width == entry->data_width) + dimm->edac_mode = EDAC_NONE; + else + dimm->edac_mode = EDAC_SECDED; + + dimm->dtype = DEV_UNKNOWN; + dimm->grain = 128; /* Likely, worse case */ + + dimm_setup_label(dimm, entry->handle); + + if (dimm->nr_pages) { + edac_dbg(1, "DIMM%i: %s size = %d MB%s\n", + dimm->idx, edac_mem_types[dimm->mtype], + PAGES_TO_MiB(dimm->nr_pages), + (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : ""); + edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n", + entry->memory_type, entry->type_detail, + entry->total_width, entry->data_width); + } + + dimm->smbios_handle = entry->handle; +} + +static void enumerate_dimms(const struct dmi_header *dh, void *arg) +{ + struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; + struct ghes_hw_desc *hw = (struct ghes_hw_desc *)arg; + struct dimm_info *d; + + if (dh->type != DMI_ENTRY_MEM_DEVICE) + return; + + /* Enlarge the array with additional 16 */ + if (!hw->num_dimms || !(hw->num_dimms % 16)) { + struct dimm_info *new; + + new = krealloc_array(hw->dimms, hw->num_dimms + 16, + sizeof(struct dimm_info), GFP_KERNEL); + if (!new) { + WARN_ON_ONCE(1); + return; } - dimm_fill->count++; + hw->dimms = new; } + + d = &hw->dimms[hw->num_dimms]; + d->idx = hw->num_dimms; + + assign_dmi_dimm_info(d, entry); + + hw->num_dimms++; +} + +static void ghes_scan_system(void) +{ + if (system_scanned) + return; + + dmi_walk(enumerate_dimms, &ghes_hw); + + system_scanned = true; } -void ghes_edac_report_mem_error(struct ghes *ghes, int sev, - struct cper_sec_mem_err *mem_err) +static int print_mem_error_other_detail(const struct cper_sec_mem_err *mem, char *msg, + const char *location, unsigned int len) { - enum hw_event_mc_err_type type; + u32 n; + + if (!msg) + return 0; + + n = 0; + len -= 1; + + n += scnprintf(msg + n, len - n, "APEI location: %s ", location); + + if (!(mem->validation_bits & CPER_MEM_VALID_ERROR_STATUS)) + goto out; + + n += scnprintf(msg + n, len - n, "status(0x%016llx): ", mem->error_status); + n += scnprintf(msg + n, len - n, "%s ", cper_mem_err_status_str(mem->error_status)); + +out: + msg[n] = '\0'; + + return n; +} + +static int ghes_edac_report_mem_error(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cper_sec_mem_err *mem_err = (struct cper_sec_mem_err *)data; + struct cper_mem_err_compact cmem; struct edac_raw_error_desc *e; struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt = NULL; + unsigned long sev = val; + struct ghes_pvt *pvt; + unsigned long flags; char *p; - u8 grain_bits; - list_for_each_entry(pvt, &ghes_reglist, list) { - if (ghes == pvt->ghes) - break; - } - if (!pvt) { - pr_err("Internal error: Can't find EDAC structure\n"); - return; - } + /* + * We can do the locking below because GHES defers error processing + * from NMI to IRQ context. Whenever that changes, we'd at least + * know. + */ + if (WARN_ON_ONCE(in_nmi())) + return NOTIFY_OK; + + spin_lock_irqsave(&ghes_lock, flags); + + pvt = ghes_pvt; + if (!pvt) + goto unlock; + mci = pvt->mci; e = &mci->error_desc; /* Cleans the error report buffer */ memset(e, 0, sizeof (*e)); e->error_count = 1; - strcpy(e->label, "unknown label"); + e->grain = 1; e->msg = pvt->msg; e->other_detail = pvt->other_detail; e->top_layer = -1; @@ -220,17 +310,17 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev, switch (sev) { case GHES_SEV_CORRECTED: - type = HW_EVENT_ERR_CORRECTED; + e->type = HW_EVENT_ERR_CORRECTED; break; case GHES_SEV_RECOVERABLE: - type = HW_EVENT_ERR_UNCORRECTED; + e->type = HW_EVENT_ERR_UNCORRECTED; break; case GHES_SEV_PANIC: - type = HW_EVENT_ERR_FATAL; + e->type = HW_EVENT_ERR_FATAL; break; default: case GHES_SEV_NO: - type = HW_EVENT_ERR_INFO; + e->type = HW_EVENT_ERR_INFO; } edac_dbg(1, "error validation_bits: 0x%08llx\n", @@ -238,266 +328,147 @@ void ghes_edac_report_mem_error(struct ghes *ghes, int sev, /* Error type, mapped on e->msg */ if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) { + u8 etype = mem_err->error_type; + p = pvt->msg; - switch (mem_err->error_type) { - case 0: - p += sprintf(p, "Unknown"); - break; - case 1: - p += sprintf(p, "No error"); - break; - case 2: - p += sprintf(p, "Single-bit ECC"); - break; - case 3: - p += sprintf(p, "Multi-bit ECC"); - break; - case 4: - p += sprintf(p, "Single-symbol ChipKill ECC"); - break; - case 5: - p += sprintf(p, "Multi-symbol ChipKill ECC"); - break; - case 6: - p += sprintf(p, "Master abort"); - break; - case 7: - p += sprintf(p, "Target abort"); - break; - case 8: - p += sprintf(p, "Parity Error"); - break; - case 9: - p += sprintf(p, "Watchdog timeout"); - break; - case 10: - p += sprintf(p, "Invalid address"); - break; - case 11: - p += sprintf(p, "Mirror Broken"); - break; - case 12: - p += sprintf(p, "Memory Sparing"); - break; - case 13: - p += sprintf(p, "Scrub corrected error"); - break; - case 14: - p += sprintf(p, "Scrub uncorrected error"); - break; - case 15: - p += sprintf(p, "Physical Memory Map-out event"); - break; - default: - p += sprintf(p, "reserved error (%d)", - mem_err->error_type); - } + p += snprintf(p, sizeof(pvt->msg), "%s", cper_mem_err_type_str(etype)); } else { - strcpy(pvt->msg, "unknown error"); + strscpy(pvt->msg, "unknown error"); } /* Error address */ - if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) { - e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT; - e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK; + if (mem_err->validation_bits & CPER_MEM_VALID_PA) { + e->page_frame_number = PHYS_PFN(mem_err->physical_addr); + e->offset_in_page = offset_in_page(mem_err->physical_addr); } /* Error grain */ - if (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS_MASK) { - e->grain = ~(mem_err->physical_addr_mask & ~PAGE_MASK); - } + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + e->grain = ~mem_err->physical_addr_mask + 1; /* Memory error location, mapped on e->location */ p = e->location; - if (mem_err->validation_bits & CPER_MEM_VALID_NODE) - p += sprintf(p, "node:%d ", mem_err->node); - if (mem_err->validation_bits & CPER_MEM_VALID_CARD) - p += sprintf(p, "card:%d ", mem_err->card); - if (mem_err->validation_bits & CPER_MEM_VALID_MODULE) - p += sprintf(p, "module:%d ", mem_err->module); - if (mem_err->validation_bits & CPER_MEM_VALID_BANK) - p += sprintf(p, "bank:%d ", mem_err->bank); - if (mem_err->validation_bits & CPER_MEM_VALID_ROW) - p += sprintf(p, "row:%d ", mem_err->row); - if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN) - p += sprintf(p, "col:%d ", mem_err->column); - if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION) - p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos); + cper_mem_err_pack(mem_err, &cmem); + p += cper_mem_err_location(&cmem, p); + + if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) { + struct dimm_info *dimm; + + p += cper_dimm_err_location(&cmem, p); + dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle); + if (dimm) { + e->top_layer = dimm->idx; + strscpy(e->label, dimm->label); + } + } if (p > e->location) *(p - 1) = '\0'; + if (!*e->label) + strscpy(e->label, "unknown memory"); + /* All other fields are mapped on e->other_detail */ p = pvt->other_detail; - if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) { - u64 status = mem_err->error_status; - - p += sprintf(p, "status(0x%016llx): ", (long long)status); - switch ((status >> 8) & 0xff) { - case 1: - p += sprintf(p, "Error detected internal to the component "); - break; - case 16: - p += sprintf(p, "Error detected in the bus "); - break; - case 4: - p += sprintf(p, "Storage error in DRAM memory "); - break; - case 5: - p += sprintf(p, "Storage error in TLB "); - break; - case 6: - p += sprintf(p, "Storage error in cache "); - break; - case 7: - p += sprintf(p, "Error in one or more functional units "); - break; - case 8: - p += sprintf(p, "component failed self test "); - break; - case 9: - p += sprintf(p, "Overflow or undervalue of internal queue "); - break; - case 17: - p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR "); - break; - case 18: - p += sprintf(p, "Improper access error "); - break; - case 19: - p += sprintf(p, "Access to a memory address which is not mapped to any component "); - break; - case 20: - p += sprintf(p, "Loss of Lockstep "); - break; - case 21: - p += sprintf(p, "Response not associated with a request "); - break; - case 22: - p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits "); - break; - case 23: - p += sprintf(p, "Detection of a PATH_ERROR "); - break; - case 25: - p += sprintf(p, "Bus operation timeout "); - break; - case 26: - p += sprintf(p, "A read was issued to data that has been poisoned "); - break; - default: - p += sprintf(p, "reserved "); - break; - } - } - if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID) - p += sprintf(p, "requestorID: 0x%016llx ", - (long long)mem_err->requestor_id); - if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID) - p += sprintf(p, "responderID: 0x%016llx ", - (long long)mem_err->responder_id); - if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID) - p += sprintf(p, "targetID: 0x%016llx ", - (long long)mem_err->responder_id); + p += print_mem_error_other_detail(mem_err, p, e->location, OTHER_DETAIL_LEN); if (p > pvt->other_detail) *(p - 1) = '\0'; - /* Generate the trace event */ - grain_bits = fls_long(e->grain); - sprintf(pvt->detail_location, "APEI location: %s %s", - e->location, e->other_detail); - trace_mc_event(type, e->msg, e->label, e->error_count, - mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer, - PAGES_TO_MiB(e->page_frame_number) | e->offset_in_page, - grain_bits, e->syndrome, pvt->detail_location); - - /* Report the error via EDAC API */ - edac_raw_mc_handle_error(type, mci, e); + edac_raw_mc_handle_error(e); + +unlock: + spin_unlock_irqrestore(&ghes_lock, flags); + + return NOTIFY_OK; } -EXPORT_SYMBOL_GPL(ghes_edac_report_mem_error); -int ghes_edac_register(struct ghes *ghes, struct device *dev) +static struct notifier_block ghes_edac_mem_err_nb = { + .notifier_call = ghes_edac_report_mem_error, + .priority = 0, +}; + +static int ghes_edac_register(struct device *dev) { bool fake = false; - int rc, num_dimm = 0; struct mem_ctl_info *mci; + struct ghes_pvt *pvt; struct edac_mc_layer layers[1]; - struct ghes_edac_pvt *pvt; - struct ghes_edac_dimm_fill dimm_fill; + unsigned long flags; + int rc = 0; + + /* finish another registration/unregistration instance first */ + mutex_lock(&ghes_reg_mutex); - /* Get the number of DIMMs */ - dmi_walk(ghes_edac_count_dimms, &num_dimm); + /* + * We have only one logical memory controller to which all DIMMs belong. + */ + if (refcount_inc_not_zero(&ghes_refcount)) + goto unlock; + + ghes_scan_system(); /* Check if we've got a bogus BIOS */ - if (num_dimm == 0) { + if (!ghes_hw.num_dimms) { fake = true; - num_dimm = 1; + ghes_hw.num_dimms = 1; } layers[0].type = EDAC_MC_LAYER_ALL_MEM; - layers[0].size = num_dimm; + layers[0].size = ghes_hw.num_dimms; layers[0].is_virt_csrow = true; - /* - * We need to serialize edac_mc_alloc() and edac_mc_add_mc(), - * to avoid duplicated memory controller numbers - */ - mutex_lock(&ghes_edac_lock); - mci = edac_mc_alloc(ghes_edac_mc_num, ARRAY_SIZE(layers), layers, - sizeof(*pvt)); + mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_pvt)); if (!mci) { pr_info("Can't allocate memory for EDAC data\n"); - mutex_unlock(&ghes_edac_lock); - return -ENOMEM; + rc = -ENOMEM; + goto unlock; } - pvt = mci->pvt_info; - memset(pvt, 0, sizeof(*pvt)); - list_add_tail(&pvt->list, &ghes_reglist); - pvt->ghes = ghes; - pvt->mci = mci; - mci->pdev = dev; + pvt = mci->pvt_info; + pvt->mci = mci; + mci->pdev = dev; mci->mtype_cap = MEM_FLAG_EMPTY; mci->edac_ctl_cap = EDAC_FLAG_NONE; mci->edac_cap = EDAC_FLAG_NONE; mci->mod_name = "ghes_edac.c"; - mci->mod_ver = GHES_EDAC_REVISION; mci->ctl_name = "ghes_edac"; mci->dev_name = "ghes"; - if (!ghes_edac_mc_num) { - if (!fake) { - pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n"); - pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n"); - pr_info("So, the end result of using this driver varies from vendor to vendor.\n"); - pr_info("If you find incorrect reports, please contact your hardware vendor\n"); - pr_info("to correct its BIOS.\n"); - pr_info("This system has %d DIMM sockets.\n", - num_dimm); - } else { - pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); - pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); - pr_info("work on such system. Use this driver with caution\n"); - } + if (fake) { + pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n"); + pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n"); + pr_info("work on such system. Use this driver with caution\n"); } + pr_info("This system has %d DIMM sockets.\n", ghes_hw.num_dimms); + if (!fake) { - /* - * Fill DIMM info from DMI for the memory controller #0 - * - * Keep it in blank for the other memory controllers, as - * there's no reliable way to properly credit each DIMM to - * the memory controller, as different BIOSes fill the - * DMI bank location fields on different ways - */ - if (!ghes_edac_mc_num) { - dimm_fill.count = 0; - dimm_fill.mci = mci; - dmi_walk(ghes_edac_dmidecode, &dimm_fill); + struct dimm_info *src, *dst; + int i = 0; + + mci_for_each_dimm(mci, dst) { + src = &ghes_hw.dimms[i]; + + dst->idx = src->idx; + dst->smbios_handle = src->smbios_handle; + dst->nr_pages = src->nr_pages; + dst->mtype = src->mtype; + dst->edac_mode = src->edac_mode; + dst->dtype = src->dtype; + dst->grain = src->grain; + + /* + * If no src->label, preserve default label assigned + * from EDAC core. + */ + if (strlen(src->label)) + memcpy(dst->label, src->label, sizeof(src->label)); + + i++; } + } else { - struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms, - mci->n_layers, 0, 0, 0); + struct dimm_info *dimm = edac_get_dimm(mci, 0, 0, 0); dimm->nr_pages = 1; dimm->grain = 128; @@ -508,30 +479,96 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) rc = edac_mc_add_mc(mci); if (rc < 0) { - pr_info("Can't register at EDAC core\n"); + pr_info("Can't register with the EDAC core\n"); edac_mc_free(mci); - mutex_unlock(&ghes_edac_lock); + rc = -ENODEV; + goto unlock; + } + + spin_lock_irqsave(&ghes_lock, flags); + ghes_pvt = pvt; + spin_unlock_irqrestore(&ghes_lock, flags); + + ghes_register_report_chain(&ghes_edac_mem_err_nb); + + /* only set on success */ + refcount_set(&ghes_refcount, 1); + +unlock: + + /* Not needed anymore */ + kfree(ghes_hw.dimms); + ghes_hw.dimms = NULL; + + mutex_unlock(&ghes_reg_mutex); + + return rc; +} + +static void ghes_edac_unregister(struct ghes *ghes) +{ + struct mem_ctl_info *mci; + unsigned long flags; + + mutex_lock(&ghes_reg_mutex); + + system_scanned = false; + memset(&ghes_hw, 0, sizeof(struct ghes_hw_desc)); + + if (!refcount_dec_and_test(&ghes_refcount)) + goto unlock; + + /* + * Wait for the irq handler being finished. + */ + spin_lock_irqsave(&ghes_lock, flags); + mci = ghes_pvt ? ghes_pvt->mci : NULL; + ghes_pvt = NULL; + spin_unlock_irqrestore(&ghes_lock, flags); + + if (!mci) + goto unlock; + + mci = edac_mc_del_mc(mci->pdev); + if (mci) + edac_mc_free(mci); + + ghes_unregister_report_chain(&ghes_edac_mem_err_nb); + +unlock: + mutex_unlock(&ghes_reg_mutex); +} + +static int __init ghes_edac_init(void) +{ + struct ghes *g, *g_tmp; + + ghes_devs = ghes_get_devices(); + if (!ghes_devs) return -ENODEV; + + if (list_empty(ghes_devs)) { + pr_info("GHES probing device list is empty\n"); + return -ENODEV; + } + + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) { + ghes_edac_register(g->dev); } - ghes_edac_mc_num++; - mutex_unlock(&ghes_edac_lock); return 0; } -EXPORT_SYMBOL_GPL(ghes_edac_register); +module_init(ghes_edac_init); -void ghes_edac_unregister(struct ghes *ghes) +static void __exit ghes_edac_exit(void) { - struct mem_ctl_info *mci; - struct ghes_edac_pvt *pvt, *tmp; - - list_for_each_entry_safe(pvt, tmp, &ghes_reglist, list) { - if (ghes == pvt->ghes) { - mci = pvt->mci; - edac_mc_del_mc(mci->pdev); - edac_mc_free(mci); - list_del(&pvt->list); - } + struct ghes *g, *g_tmp; + + list_for_each_entry_safe(g, g_tmp, ghes_devs, elist) { + ghes_edac_unregister(g); } } -EXPORT_SYMBOL_GPL(ghes_edac_unregister); +module_exit(ghes_edac_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Output ACPI APEI/GHES BIOS detected errors via EDAC"); |
