author	Moshe Shemesh <moshe@mellanox.com>	2020-10-07 09:00:49 +0300
committer	Jakub Kicinski <kuba@kernel.org>	2020-10-09 12:06:52 -0700
commit	38b9f903f22b9baa5c4b9bfb07c8bbc49f5efbba (patch)
tree	a57130d519916f7a0053972287e2803e1f7a07f1 /drivers/net/ethernet/mellanox/mlx5/core/health.c
parent	e7f4d0bcb8dad6eac657e295b562b1a925d4f3ce (diff)
net/mlx5: Handle sync reset request event
Once the driver gets sync_reset_request from firmware, it prepares for the coming reset and sends an acknowledgement. After this event the driver expects a device reset: either it triggers a PCI reset itself on the sync_reset_now event, or the PCI reset is triggered by another PF of the same device. It therefore moves to reset-requested mode, and if the PCI reset is triggered by the other PF, it detects the reset and reloads.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
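For orientation, the handshake described in the commit message can be summarized in a standalone sketch. This is not code from the patch: the state names and the fake_* helpers below are invented here purely to illustrate the sequence of acknowledging the request, entering a reset-requested state, and reloading once a PCI reset is observed (whether triggered locally or by a peer PF).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: invented states mirroring the flow in the commit message. */
enum reset_state {
	RESET_IDLE,
	RESET_REQUESTED,   /* ack sent, waiting for the PCI reset            */
	RESET_RELOADING,   /* PCI reset observed (ours or a peer PF's)       */
};

struct fake_dev {
	enum reset_state state;
};

/* Stand-ins for the firmware acknowledgement and the reload path. */
static void fake_send_ack(struct fake_dev *dev)      { puts("ack sync_reset_request"); }
static void fake_reload_driver(struct fake_dev *dev) { puts("PCI reset detected, reloading"); }

/* Handle sync_reset_request: prepare, acknowledge, arm for the coming reset. */
static void handle_sync_reset_request(struct fake_dev *dev)
{
	fake_send_ack(dev);
	dev->state = RESET_REQUESTED;
}

/* Later, when a PCI reset is observed (triggered locally on sync_reset_now
 * or by another PF of the same device), the driver reloads. */
static void handle_pci_reset(struct fake_dev *dev)
{
	if (dev->state != RESET_REQUESTED)
		return;
	dev->state = RESET_RELOADING;
	fake_reload_driver(dev);
}

int main(void)
{
	struct fake_dev dev = { .state = RESET_IDLE };

	handle_sync_reset_request(&dev);  /* firmware asks for a sync reset        */
	handle_pci_reset(&dev);           /* reset arrives, possibly from a peer PF */
	return 0;
}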
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/health.c')
-rw-r--r--	drivers/net/ethernet/mellanox/mlx5/core/health.c | 35
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index b31f769d2df9..54523bed16cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -110,7 +110,7 @@ static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
return rfr && synd;
}
-static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
+u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev)
{
if (sensor_pci_not_working(dev))
return MLX5_SENSOR_PCI_COMM_ERR;
@@ -173,7 +173,7 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
* Check again to avoid a redundant 2nd reset. If the fatal erros was
* PCI related a reset won't help.
*/
- fatal_error = check_fatal_sensors(dev);
+ fatal_error = mlx5_health_check_fatal_sensors(dev);
if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
fatal_error == MLX5_SENSOR_NIC_DISABLED ||
fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
@@ -195,7 +195,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
bool err_detected = false;
/* Mark the device as fatal in order to abort FW commands */
- if ((check_fatal_sensors(dev) || force) &&
+ if ((mlx5_health_check_fatal_sensors(dev) || force) &&
dev->state == MLX5_DEVICE_STATE_UP) {
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
err_detected = true;
@@ -208,7 +208,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
goto unlock;
}
- if (check_fatal_sensors(dev) || force) { /* protected state setting */
+ if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
mlx5_cmd_flush(dev);
}
@@ -231,7 +231,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
mlx5_core_err(dev, "start\n");
- if (check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
+ if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
/* Get cr-dump and reset FW semaphore */
lock = lock_sem_sw_reset(dev, true);
@@ -308,26 +308,31 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
/* How much time to wait until health resetting the driver (in msecs) */
#define MLX5_RECOVERY_WAIT_MSECS 60000
-static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
+int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev)
{
unsigned long end;
- mlx5_core_warn(dev, "handling bad device here\n");
- mlx5_handle_bad_state(dev);
end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
while (sensor_pci_not_working(dev)) {
- if (time_after(jiffies, end)) {
- mlx5_core_err(dev,
- "health recovery flow aborted, PCI reads still not working\n");
- return -EIO;
- }
+ if (time_after(jiffies, end))
+ return -ETIMEDOUT;
msleep(100);
}
+ return 0;
+}
+static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
+{
+ mlx5_core_warn(dev, "handling bad device here\n");
+ mlx5_handle_bad_state(dev);
+ if (mlx5_health_wait_pci_up(dev)) {
+ mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n");
+ return -EIO;
+ }
mlx5_core_err(dev, "starting health recovery flow\n");
mlx5_recover_device(dev);
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
- check_fatal_sensors(dev)) {
+ mlx5_health_check_fatal_sensors(dev)) {
mlx5_core_err(dev, "health recovery failed\n");
return -EIO;
}
@@ -696,7 +701,7 @@ static void poll_health(struct timer_list *t)
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
goto out;
- fatal_error = check_fatal_sensors(dev);
+ fatal_error = mlx5_health_check_fatal_sensors(dev);
if (fatal_error && !health->fatal_error) {
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
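The listing above is truncated inside the poll_health() hunk, but the shape of the new helper is already clear: mlx5_health_wait_pci_up() is a bounded poll that keeps probing PCI until reads succeed or the 60-second MLX5_RECOVERY_WAIT_MSECS budget runs out. A minimal user-space analogue of that loop, with a stubbed pci_readable() probe standing in for the driver's sensor_pci_not_working() check, might look like this (a sketch, not driver code):

#include <errno.h>
#include <stdbool.h>
#include <time.h>
#include <unistd.h>

#define RECOVERY_WAIT_MSECS 60000  /* same budget as MLX5_RECOVERY_WAIT_MSECS */

/* Stub standing in for the driver's PCI-readability sensor; always "up" here. */
static bool pci_readable(void)
{
	return true;
}

/* Poll until the device answers or the deadline expires, mirroring the
 * jiffies/msleep loop the patch factors into mlx5_health_wait_pci_up(). */
static int wait_pci_up(void)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	while (!pci_readable()) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		long elapsed_ms = (now.tv_sec - start.tv_sec) * 1000 +
				  (now.tv_nsec - start.tv_nsec) / 1000000;
		if (elapsed_ms > RECOVERY_WAIT_MSECS)
			return -ETIMEDOUT;
		usleep(100 * 1000);  /* 100 ms, like msleep(100) in the kernel loop */
	}
	return 0;
}

int main(void)
{
	return wait_pci_up() ? 1 : 0;
}

Returning -ETIMEDOUT from the helper (instead of logging and returning -EIO inside the loop, as the old code did) lets each caller decide how to report the failure; mlx5_health_try_recover() keeps the original "PCI reads still not working" message, while the sync-reset flow can use the exported helper on its own terms.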