powerpc/eeh: Use a goto for recovery failures

The EEH recovery logic in eeh_handle_normal_event() has some pretty strange flow control. If we remove all the actual recovery logic we're left with the following skeleton: if (result != PCI_ERS_RESULT_DISCONNECT) { ... } if (result != PCI_ERS_RESULT_DISCONNECT) { ... } if (result == PCI_ERS_RESULT_NONE) { ... } if (result == PCI_ERS_RESULT_CAN_RECOVER) { ... } if (result == PCI_ERS_RESULT_CAN_RECOVER) { ... } if (result == PCI_ERS_RESULT_NEED_RESET) { ... } if ((result == PCI_ERS_RESULT_RECOVERED) || (result == PCI_ERS_RESULT_NONE)) { ... goto out; } /* * unsuccessful recovery / PCI_ERS_RESULT_DISCONECTED * handling is here. */ ... out: ... Most of the "if () { ... }" blocks above change "result" to PCI_ERS_RESULT_DISCONNECTED if an error occurs in that recovery step. This makes the control flow a bit confusing since it breaks the early-exit pattern that is generally used in Linux. In any case we end up handling the error in the final else block so why not just jump there directly? Doing so also allows us to de-indent a bunch of code. No functional changes. [dja: rebase on top of linux-next + my preceeding refactor, move clearing the EEH_DEV_NO_HANDLER bit above the first goto so that it is always clear in the error handler code as it was before.] Signed-off-by: Oliver O'Halloran <oohall@gmail.com> Signed-off-by: Daniel Axtens <dja@axtens.net> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20211015070628.1331635-2-dja@axtens.net
author: Oliver O'Halloran <oohall@gmail.com> 2021-10-15 18:06:28 +1100
committer: Michael Ellerman <mpe@ellerman.id.au> 2021-11-25 11:25:31 +1100
commit: 157616f3c2284f13ca7db9897293f944e6ab8199 (patch)
tree: aae731749eb06686634e823ba046fcaf7e7fd2ca
parent: 10b34ece132ee46dc4e6459c765d180c422a09fa (diff)
1 files changed, 45 insertions, 48 deletions
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b676fc079356..422f80b5b27b 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -905,18 +905,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 	}
 #endif /* CONFIG_STACKTRACE */
 
+	eeh_for_each_pe(pe, tmp_pe)
+		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+			edev->mode &= ~EEH_DEV_NO_HANDLER;
+
 	eeh_pe_update_time_stamp(pe);
 	pe->freeze_count++;
 	if (pe->freeze_count > eeh_max_freezes) {
 		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
 		       pe->phb->global_number, pe->addr,
 		       pe->freeze_count);
-		result = PCI_ERS_RESULT_DISCONNECT;
-	}
 
-	eeh_for_each_pe(pe, tmp_pe)
-		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
-			edev->mode &= ~EEH_DEV_NO_HANDLER;
+		goto recover_failed;
+	}
 
 	/* Walk the various device drivers attached to this slot through
 	 * a reset sequence, giving each an opportunity to do what it needs
@@ -928,39 +929,38 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 	 * the error. Override the result if necessary to have partially
 	 * hotplug for this case.
 	 */
-	if (result != PCI_ERS_RESULT_DISCONNECT) {
-		pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
-			pe->freeze_count, eeh_max_freezes);
-		pr_info("EEH: Notify device drivers to shutdown\n");
-		eeh_set_channel_state(pe, pci_channel_io_frozen);
-		eeh_set_irq_state(pe, false);
-		eeh_pe_report("error_detected(IO frozen)", pe,
-			      eeh_report_error, &result);
-		if ((pe->type & EEH_PE_PHB) &&
-		    result != PCI_ERS_RESULT_NONE &&
-		    result != PCI_ERS_RESULT_NEED_RESET)
-			result = PCI_ERS_RESULT_NEED_RESET;
-	}
+	pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
+		pe->freeze_count, eeh_max_freezes);
+	pr_info("EEH: Notify device drivers to shutdown\n");
+	eeh_set_channel_state(pe, pci_channel_io_frozen);
+	eeh_set_irq_state(pe, false);
+	eeh_pe_report("error_detected(IO frozen)", pe,
+		      eeh_report_error, &result);
+	if (result == PCI_ERS_RESULT_DISCONNECT)
+		goto recover_failed;
+
+	/*
+	 * Error logged on a PHB are always fences which need a full
+	 * PHB reset to clear so force that to happen.
+	 */
+	if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE)
+		result = PCI_ERS_RESULT_NEED_RESET;
 
 	/* Get the current PCI slot state. This can take a long time,
 	 * sometimes over 300 seconds for certain systems.
 	 */
-	if (result != PCI_ERS_RESULT_DISCONNECT) {
-		rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
-		if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
-			pr_warn("EEH: Permanent failure\n");
-			result = PCI_ERS_RESULT_DISCONNECT;
-		}
+	rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY * 1000);
+	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+		pr_warn("EEH: Permanent failure\n");
+		goto recover_failed;
 	}
 
 	/* Since rtas may enable MMIO when posting the error log,
 	 * don't post the error log until after all dev drivers
 	 * have been informed.
 	 */
-	if (result != PCI_ERS_RESULT_DISCONNECT) {
-		pr_info("EEH: Collect temporary log\n");
-		eeh_slot_error_detail(pe, EEH_LOG_TEMP);
-	}
+	pr_info("EEH: Collect temporary log\n");
+	eeh_slot_error_detail(pe, EEH_LOG_TEMP);
 
 	/* If all device drivers were EEH-unaware, then shut
 	 * down all of the device drivers, and hope they
@@ -970,9 +970,8 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		pr_info("EEH: Reset with hotplug activity\n");
 		rc = eeh_reset_device(pe, bus, NULL, false);
 		if (rc) {
-			pr_warn("%s: Unable to reset, err=%d\n",
-				__func__, rc);
-			result = PCI_ERS_RESULT_DISCONNECT;
+			pr_warn("%s: Unable to reset, err=%d\n", __func__, rc);
+			goto recover_failed;
 		}
 	}
 
@@ -980,10 +979,10 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
 		pr_info("EEH: Enable I/O for affected devices\n");
 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+		if (rc < 0)
+			goto recover_failed;
 
-		if (rc < 0) {
-			result = PCI_ERS_RESULT_DISCONNECT;
-		} else if (rc) {
+		if (rc) {
 			result = PCI_ERS_RESULT_NEED_RESET;
 		} else {
 			pr_info("EEH: Notify device drivers to resume I/O\n");
@@ -991,15 +990,13 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 				      eeh_report_mmio_enabled, &result);
 		}
 	}
-
-	/* If all devices reported they can proceed, then re-enable DMA */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
 		pr_info("EEH: Enabled DMA for affected devices\n");
 		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+		if (rc < 0)
+			goto recover_failed;
 
-		if (rc < 0) {
-			result = PCI_ERS_RESULT_DISCONNECT;
-		} else if (rc) {
+		if (rc) {
 			result = PCI_ERS_RESULT_NEED_RESET;
 		} else {
 			/*
@@ -1017,16 +1014,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		pr_info("EEH: Reset without hotplug activity\n");
 		rc = eeh_reset_device(pe, bus, &rmv_data, true);
 		if (rc) {
-			pr_warn("%s: Cannot reset, err=%d\n",
-				__func__, rc);
-			result = PCI_ERS_RESULT_DISCONNECT;
-		} else {
-			result = PCI_ERS_RESULT_NONE;
-			eeh_set_channel_state(pe, pci_channel_io_normal);
-			eeh_set_irq_state(pe, true);
-			eeh_pe_report("slot_reset", pe, eeh_report_reset,
-				      &result);
+			pr_warn("%s: Cannot reset, err=%d\n", __func__, rc);
+			goto recover_failed;
 		}
+
+		result = PCI_ERS_RESULT_NONE;
+		eeh_set_channel_state(pe, pci_channel_io_normal);
+		eeh_set_irq_state(pe, true);
+		eeh_pe_report("slot_reset", pe, eeh_report_reset,
+			      &result);
 	}
 
 	if ((result == PCI_ERS_RESULT_RECOVERED) ||
@@ -1057,6 +1053,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 		goto out;
 	}
 
+recover_failed:
 	/*
 	 * About 90% of all real-life EEH failures in the field
 	 * are due to poorly seated PCI cards. Only 10% or so are
author	Oliver O'Halloran <oohall@gmail.com>	2021-10-15 18:06:28 +1100
committer	Michael Ellerman <mpe@ellerman.id.au>	2021-11-25 11:25:31 +1100
commit	157616f3c2284f13ca7db9897293f944e6ab8199 (patch)
tree	aae731749eb06686634e823ba046fcaf7e7fd2ca
parent	10b34ece132ee46dc4e6459c765d180c422a09fa (diff)