author	Moshe Shemesh <moshe@mellanox.com>	2020-10-07 09:00:49 +0300
committer	Jakub Kicinski <kuba@kernel.org>	2020-10-09 12:06:52 -0700
commit	38b9f903f22b9baa5c4b9bfb07c8bbc49f5efbba (patch)
tree	a57130d519916f7a0053972287e2803e1f7a07f1 /drivers/net/ethernet/mellanox/mlx5/core/health.c
parent	e7f4d0bcb8dad6eac657e295b562b1a925d4f3ce (diff)
net/mlx5: Handle sync reset request event
Once the driver gets sync_reset_request from firmware, it prepares for the coming reset and sends an acknowledgement. After this event the driver expects a device reset: either it triggers a PCI reset itself on the sync_reset_now event, or the PCI reset is triggered by another PF of the same device. It therefore moves to reset-requested mode, and if the PCI reset is triggered by the other PF, it detects the reset and reloads.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
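For orientation, the handshake described in the commit message can be summarized in a standalone sketch. This is not code from the patch: the state names and the fake_* helpers below are invented here purely to illustrate the sequence of acknowledging the request, entering a reset-requested state, and reloading once a PCI reset is observed (whether triggered locally or by a peer PF).

#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: invented states mirroring the flow in the commit message. */
enum reset_state {
	RESET_IDLE,
	RESET_REQUESTED,   /* ack sent, waiting for the PCI reset            */
	RESET_RELOADING,   /* PCI reset observed (ours or a peer PF's)       */
};

struct fake_dev {
	enum reset_state state;
};

/* Stand-ins for the firmware acknowledgement and the reload path. */
static void fake_send_ack(struct fake_dev *dev)      { puts("ack sync_reset_request"); }
static void fake_reload_driver(struct fake_dev *dev) { puts("PCI reset detected, reloading"); }

/* Handle sync_reset_request: prepare, acknowledge, arm for the coming reset. */
static void handle_sync_reset_request(struct fake_dev *dev)
{
	fake_send_ack(dev);
	dev->state = RESET_REQUESTED;
}

/* Later, when a PCI reset is observed (triggered locally on sync_reset_now
 * or by another PF of the same device), the driver reloads. */
static void handle_pci_reset(struct fake_dev *dev)
{
	if (dev->state != RESET_REQUESTED)
		return;
	dev->state = RESET_RELOADING;
	fake_reload_driver(dev);
}

int main(void)
{
	struct fake_dev dev = { .state = RESET_IDLE };

	handle_sync_reset_request(&dev);  /* firmware asks for a sync reset        */
	handle_pci_reset(&dev);           /* reset arrives, possibly from a peer PF */
	return 0;
}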
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/health.c')
-rw-r--r--	drivers/net/ethernet/mellanox/mlx5/core/health.c | 35
1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index b31f769d2df9..54523bed16cd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -110,7 +110,7 @@ static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
return rfr && synd;
}
-static u32 check_fatal_sensors(struct mlx5_core_dev *dev)
+u32 mlx5_health_check_fatal_sensors(struct mlx5_core_dev *dev)
{
if (sensor_pci_not_working(dev))
return MLX5_SENSOR_PCI_COMM_ERR;
@@ -173,7 +173,7 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
* Check again to avoid a redundant 2nd reset. If the fatal erros was
* PCI related a reset won't help.
*/
- fatal_error = check_fatal_sensors(dev);
+ fatal_error = mlx5_health_check_fatal_sensors(dev);
if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
fatal_error == MLX5_SENSOR_NIC_DISABLED ||
fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
@@ -195,7 +195,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
bool err_detected = false;
/* Mark the device as fatal in order to abort FW commands */
- if ((check_fatal_sensors(dev) || force) &&
+ if ((mlx5_health_check_fatal_sensors(dev) || force) &&
dev->state == MLX5_DEVICE_STATE_UP) {
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
err_detected = true;
@@ -208,7 +208,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
goto unlock;
}
- if (check_fatal_sensors(dev) || force) { /* protected state setting */
+ if (mlx5_health_check_fatal_sensors(dev) || force) { /* protected state setting */
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
mlx5_cmd_flush(dev);
}
@@ -231,7 +231,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
mlx5_core_err(dev, "start\n");
- if (check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
+ if (mlx5_health_check_fatal_sensors(dev) == MLX5_SENSOR_FW_SYND_RFR) {
/* Get cr-dump and reset FW semaphore */
lock = lock_sem_sw_reset(dev, true);
@@ -308,26 +308,31 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
/* How much time to wait until health resetting the driver (in msecs) */
#define MLX5_RECOVERY_WAIT_MSECS 60000
-static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
+int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev)
{
unsigned long end;
- mlx5_core_warn(dev, "handling bad device here\n");
- mlx5_handle_bad_state(dev);
end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
while (sensor_pci_not_working(dev)) {
- if (time_after(jiffies, end)) {
- mlx5_core_err(dev,
- "health recovery flow aborted, PCI reads still not working\n");
- return -EIO;
- }
+ if (time_after(jiffies, end))
+ return -ETIMEDOUT;
msleep(100);
}
+ return 0;
+}
+static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
+{
+ mlx5_core_warn(dev, "handling bad device here\n");
+ mlx5_handle_bad_state(dev);
+ if (mlx5_health_wait_pci_up(dev)) {
+ mlx5_core_err(dev, "health recovery flow aborted, PCI reads still not working\n");
+ return -EIO;
+ }
mlx5_core_err(dev, "starting health recovery flow\n");
mlx5_recover_device(dev);
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
- check_fatal_sensors(dev)) {
+ mlx5_health_check_fatal_sensors(dev)) {
mlx5_core_err(dev, "health recovery failed\n");
return -EIO;
}
@@ -696,7 +701,7 @@ static void poll_health(struct timer_list *t)
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
goto out;
- fatal_error = check_fatal_sensors(dev);
+ fatal_error = mlx5_health_check_fatal_sensors(dev);
if (fatal_error && !health->fatal_error) {
mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error);
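The listing above is truncated inside the poll_health() hunk, but the shape of the new helper is already clear: mlx5_health_wait_pci_up() is a bounded poll that keeps probing PCI until reads succeed or the 60-second MLX5_RECOVERY_WAIT_MSECS budget runs out. A minimal user-space analogue of that loop, with a stubbed pci_readable() probe standing in for the driver's sensor_pci_not_working() check, might look like this (a sketch, not driver code):

#include <errno.h>
#include <stdbool.h>
#include <time.h>
#include <unistd.h>

#define RECOVERY_WAIT_MSECS 60000  /* same budget as MLX5_RECOVERY_WAIT_MSECS */

/* Stub standing in for the driver's PCI-readability sensor; always "up" here. */
static bool pci_readable(void)
{
	return true;
}

/* Poll until the device answers or the deadline expires, mirroring the
 * jiffies/msleep loop the patch factors into mlx5_health_wait_pci_up(). */
static int wait_pci_up(void)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	while (!pci_readable()) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		long elapsed_ms = (now.tv_sec - start.tv_sec) * 1000 +
				  (now.tv_nsec - start.tv_nsec) / 1000000;
		if (elapsed_ms > RECOVERY_WAIT_MSECS)
			return -ETIMEDOUT;
		usleep(100 * 1000);  /* 100 ms, like msleep(100) in the kernel loop */
	}
	return 0;
}

int main(void)
{
	return wait_pci_up() ? 1 : 0;
}

Returning -ETIMEDOUT from the helper (instead of logging and returning -EIO inside the loop, as the old code did) lets each caller decide how to report the failure; mlx5_health_try_recover() keeps the original "PCI reads still not working" message, while the sync-reset flow can use the exported helper on its own terms.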