diff options
Diffstat (limited to 'arch/powerpc/kernel/eeh_driver.c')
-rw-r--r-- | arch/powerpc/kernel/eeh_driver.c | 220 |
1 files changed, 105 insertions, 115 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3dd1a422fc29..48773d2d9be3 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -16,7 +16,6 @@ #include <asm/eeh_event.h> #include <asm/ppc-pci.h> #include <asm/pci-bridge.h> -#include <asm/prom.h> #include <asm/rtas.h> struct eeh_rmv_data { @@ -40,7 +39,7 @@ static int eeh_result_priority(enum pci_ers_result result) case PCI_ERS_RESULT_NEED_RESET: return 6; default: - WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); + WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", result); return 0; } }; @@ -61,7 +60,7 @@ static const char *pci_ers_result_name(enum pci_ers_result result) case PCI_ERS_RESULT_NO_AER_DRIVER: return "no AER driver"; default: - WARN_ONCE(1, "Unknown result type: %d\n", (int)result); + WARN_ONCE(1, "Unknown result type: %d\n", result); return "unknown"; } }; @@ -104,13 +103,13 @@ static bool eeh_edev_actionable(struct eeh_dev *edev) */ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return NULL; - if (!try_module_get(pdev->driver->driver.owner)) + if (!try_module_get(pdev->dev.driver->owner)) return NULL; - return pdev->driver; + return to_pci_driver(pdev->dev.driver); } /** @@ -122,10 +121,10 @@ static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) */ static inline void eeh_pcid_put(struct pci_dev *pdev) { - if (!pdev || !pdev->driver) + if (!pdev || !pdev->dev.driver) return; - module_put(pdev->driver->driver.owner); + module_put(pdev->dev.driver->owner); } /** @@ -214,7 +213,7 @@ static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) pci_save_state(pdev); } -static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) +static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) { struct eeh_pe *pe; struct eeh_dev *edev, *tmp; @@ -425,8 +424,8 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); #ifdef CONFIG_PCI_IOV - if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) - eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); + if (eeh_ops->notify_resume) + eeh_ops->notify_resume(edev); #endif return PCI_ERS_RESULT_NONE; } @@ -477,7 +476,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev) } #ifdef CONFIG_PCI_IOV - pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); + pci_iov_add_virtfn(edev->physfn, edev->vf_index); #endif return NULL; } @@ -521,16 +520,8 @@ static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) if (edev->physfn) { #ifdef CONFIG_PCI_IOV - struct pci_dn *pdn = eeh_dev_to_pdn(edev); - - pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); + pci_iov_remove_virtfn(edev->physfn, edev->vf_index); edev->pdev = NULL; - - /* - * We have to set the VF PE number to invalid one, which is - * required to plug the VF successfully. - */ - pdn->pe_number = IODA_INVALID_PE; #endif if (rmv_data) list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); @@ -550,7 +541,7 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) continue; edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); - eeh_rmv_from_parent_pe(edev); + eeh_pe_tree_remove(edev); } return NULL; @@ -759,7 +750,7 @@ static void eeh_pe_cleanup(struct eeh_pe *pe) * @pdev: pci_dev to check * * This function may return a false positive if we can't determine the slot's - * presence state. This might happen for for PCIe slots if the PE containing + * presence state. This might happen for PCIe slots if the PE containing * the upstream bridge is also frozen, or the bridge is part of the same PE * as the device. * @@ -913,18 +904,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } #endif /* CONFIG_STACKTRACE */ + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp) + edev->mode &= ~EEH_DEV_NO_HANDLER; + eeh_pe_update_time_stamp(pe); pe->freeze_count++; if (pe->freeze_count > eeh_max_freezes) { pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n", pe->phb->global_number, pe->addr, pe->freeze_count); - result = PCI_ERS_RESULT_DISCONNECT; - } - eeh_for_each_pe(pe, tmp_pe) - eeh_pe_for_each_dev(tmp_pe, edev, tmp) - edev->mode &= ~EEH_DEV_NO_HANDLER; + goto recover_failed; + } /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -936,39 +928,38 @@ void eeh_handle_normal_event(struct eeh_pe *pe) * the error. Override the result if necessary to have partially * hotplug for this case. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", - pe->freeze_count, eeh_max_freezes); - pr_info("EEH: Notify device drivers to shutdown\n"); - eeh_set_channel_state(pe, pci_channel_io_frozen); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(IO frozen)", pe, - eeh_report_error, &result); - if ((pe->type & EEH_PE_PHB) && - result != PCI_ERS_RESULT_NONE && - result != PCI_ERS_RESULT_NEED_RESET) - result = PCI_ERS_RESULT_NEED_RESET; - } + pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n", + pe->freeze_count, eeh_max_freezes); + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_set_channel_state(pe, pci_channel_io_frozen); + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(IO frozen)", pe, + eeh_report_error, &result); + if (result == PCI_ERS_RESULT_DISCONNECT) + goto recover_failed; + + /* + * Error logged on a PHB are always fences which need a full + * PHB reset to clear so force that to happen. + */ + if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE) + result = PCI_ERS_RESULT_NEED_RESET; /* Get the current PCI slot state. This can take a long time, * sometimes over 300 seconds for certain systems. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); - if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { - pr_warn("EEH: Permanent failure\n"); - result = PCI_ERS_RESULT_DISCONNECT; - } + rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warn("EEH: Permanent failure\n"); + goto recover_failed; } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ - if (result != PCI_ERS_RESULT_DISCONNECT) { - pr_info("EEH: Collect temporary log\n"); - eeh_slot_error_detail(pe, EEH_LOG_TEMP); - } + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they @@ -978,9 +969,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Reset with hotplug activity\n"); rc = eeh_reset_device(pe, bus, NULL, false); if (rc) { - pr_warn("%s: Unable to reset, err=%d\n", - __func__, rc); - result = PCI_ERS_RESULT_DISCONNECT; + pr_warn("%s: Unable to reset, err=%d\n", __func__, rc); + goto recover_failed; } } @@ -988,10 +978,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe) if (result == PCI_ERS_RESULT_CAN_RECOVER) { pr_info("EEH: Enable I/O for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + if (rc < 0) + goto recover_failed; - if (rc < 0) { - result = PCI_ERS_RESULT_DISCONNECT; - } else if (rc) { + if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { pr_info("EEH: Notify device drivers to resume I/O\n"); @@ -999,15 +989,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe) eeh_report_mmio_enabled, &result); } } - - /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { pr_info("EEH: Enabled DMA for affected devices\n"); rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + if (rc < 0) + goto recover_failed; - if (rc < 0) { - result = PCI_ERS_RESULT_DISCONNECT; - } else if (rc) { + if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { /* @@ -1025,16 +1013,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe) pr_info("EEH: Reset without hotplug activity\n"); rc = eeh_reset_device(pe, bus, &rmv_data, true); if (rc) { - pr_warn("%s: Cannot reset, err=%d\n", - __func__, rc); - result = PCI_ERS_RESULT_DISCONNECT; - } else { - result = PCI_ERS_RESULT_NONE; - eeh_set_channel_state(pe, pci_channel_io_normal); - eeh_set_irq_state(pe, true); - eeh_pe_report("slot_reset", pe, eeh_report_reset, - &result); + pr_warn("%s: Cannot reset, err=%d\n", __func__, rc); + goto recover_failed; } + + result = PCI_ERS_RESULT_NONE; + eeh_set_channel_state(pe, pci_channel_io_normal); + eeh_set_irq_state(pe, true); + eeh_pe_report("slot_reset", pe, eeh_report_reset, + &result); } if ((result == PCI_ERS_RESULT_RECOVERED) || @@ -1062,45 +1049,47 @@ void eeh_handle_normal_event(struct eeh_pe *pe) } pr_info("EEH: Recovery successful.\n"); - } else { - /* - * About 90% of all real-life EEH failures in the field - * are due to poorly seated PCI cards. Only 10% or so are - * due to actual, failed cards. - */ - pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" - "Please try reseating or replacing it\n", - pe->phb->global_number, pe->addr); + goto out; + } - eeh_slot_error_detail(pe, EEH_LOG_PERM); +recover_failed: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); - /* Notify all devices that they're about to go down. */ - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_set_irq_state(pe, false); - eeh_pe_report("error_detected(permanent failure)", pe, - eeh_report_failure, NULL); + eeh_slot_error_detail(pe, EEH_LOG_PERM); - /* Mark the PE to be removed permanently */ - eeh_pe_state_mark(pe, EEH_PE_REMOVED); + /* Notify all devices that they're about to go down. */ + eeh_set_irq_state(pe, false); + eeh_pe_report("error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); - /* - * Shut down the device drivers for good. We mark - * all removed devices correctly to avoid access - * the their PCI config any more. - */ - if (pe->type & EEH_PE_VF) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); - } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + /* Mark the PE to be removed permanently */ + eeh_pe_state_mark(pe, EEH_PE_REMOVED); - pci_lock_rescan_remove(); - pci_hp_remove_devices(bus); - pci_unlock_rescan_remove(); - /* The passed PE should no longer be used */ - return; - } + /* + * Shut down the device drivers for good. We mark + * all removed devices correctly to avoid access + * the their PCI config any more. + */ + if (pe->type & EEH_PE_VF) { + eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); + eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); + + pci_lock_rescan_remove(); + pci_hp_remove_devices(bus); + pci_unlock_rescan_remove(); + /* The passed PE should no longer be used */ + return; } out: @@ -1190,6 +1179,17 @@ void eeh_handle_special_event(void) eeh_pe_state_mark(pe, EEH_PE_RECOVERING); eeh_handle_normal_event(pe); } else { + eeh_for_each_pe(pe, tmp_pe) + eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) + edev->mode &= ~EEH_DEV_NO_HANDLER; + + /* Notify all devices to be down */ + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); + eeh_pe_report( + "error_detected(permanent failure)", pe, + eeh_report_failure, NULL); + eeh_set_channel_state(pe, pci_channel_io_perm_failure); + pci_lock_rescan_remove(); list_for_each_entry(hose, &hose_list, list_node) { phb_pe = eeh_phb_pe_get(hose); @@ -1198,16 +1198,6 @@ void eeh_handle_special_event(void) (phb_pe->state & EEH_PE_RECOVERING)) continue; - eeh_for_each_pe(pe, tmp_pe) - eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) - edev->mode &= ~EEH_DEV_NO_HANDLER; - - /* Notify all devices to be down */ - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); - eeh_set_channel_state(pe, pci_channel_io_perm_failure); - eeh_pe_report( - "error_detected(permanent failure)", pe, - eeh_report_failure, NULL); bus = eeh_pe_bus_get(phb_pe); if (!bus) { pr_err("%s: Cannot find PCI bus for " |