summaryrefslogtreecommitdiff
path: root/drivers/net/ethernet/mellanox/mlx5/core/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx5/core/health.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c147
1 files changed, 116 insertions, 31 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 037e18dd4be0..64f1abc4dc36 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -36,14 +36,15 @@
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/mlx5/driver.h>
+#include <linux/kern_levels.h>
#include "mlx5_core.h"
#include "lib/eq.h"
#include "lib/mlx5.h"
#include "lib/pci_vsc.h"
+#include "lib/tout.h"
#include "diag/fw_tracer.h"
enum {
- MLX5_HEALTH_POLL_INTERVAL = 2 * HZ,
MAX_MISSES = 3,
};
@@ -74,6 +75,11 @@ enum {
MLX5_SENSOR_FW_SYND_RFR = 5,
};
+enum {
+ MLX5_SEVERITY_MASK = 0x7,
+ MLX5_SEVERITY_VALID_MASK = 0x8,
+};
+
u8 mlx5_get_nic_state(struct mlx5_core_dev *dev)
{
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
@@ -98,12 +104,19 @@ static bool sensor_pci_not_working(struct mlx5_core_dev *dev)
return (ioread32be(&h->fw_ver) == 0xffffffff);
}
+static int mlx5_health_get_rfr(u8 rfr_severity)
+{
+ return rfr_severity >> MLX5_RFR_BIT_OFFSET;
+}
+
static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
- u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
u8 synd = ioread8(&h->synd);
+ u8 rfr;
+
+ rfr = mlx5_health_get_rfr(ioread8(&h->rfr_severity));
if (rfr && synd)
mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
@@ -219,11 +232,9 @@ unlock:
mutex_unlock(&dev->intf_state_mutex);
}
-#define MLX5_CRDUMP_WAIT_MS 60000
-#define MLX5_FW_RESET_WAIT_MS 1000
void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
{
- unsigned long end, delay_ms = MLX5_FW_RESET_WAIT_MS;
+ unsigned long end, delay_ms = mlx5_tout_ms(dev, PCI_TOGGLE);
int lock = -EBUSY;
mutex_lock(&dev->intf_state_mutex);
@@ -237,7 +248,7 @@ void mlx5_error_sw_reset(struct mlx5_core_dev *dev)
lock = lock_sem_sw_reset(dev, true);
if (lock == -EBUSY) {
- delay_ms = MLX5_CRDUMP_WAIT_MS;
+ delay_ms = mlx5_tout_ms(dev, FULL_CRDUMP);
goto recover_from_sw_reset;
}
/* Execute SW reset */
@@ -307,13 +318,11 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
mlx5_disable_device(dev);
}
-/* How much time to wait until health resetting the driver (in msecs) */
-#define MLX5_RECOVERY_WAIT_MSECS 60000
int mlx5_health_wait_pci_up(struct mlx5_core_dev *dev)
{
unsigned long end;
- end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
+ end = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FW_RESET));
while (sensor_pci_not_working(dev)) {
if (time_after(jiffies, end))
return -ETIMEDOUT;
@@ -370,35 +379,69 @@ static const char *hsynd_str(u8 synd)
}
}
+static const char *mlx5_loglevel_str(int level)
+{
+ switch (level) {
+ case LOGLEVEL_EMERG:
+ return "EMERGENCY";
+ case LOGLEVEL_ALERT:
+ return "ALERT";
+ case LOGLEVEL_CRIT:
+ return "CRITICAL";
+ case LOGLEVEL_ERR:
+ return "ERROR";
+ case LOGLEVEL_WARNING:
+ return "WARNING";
+ case LOGLEVEL_NOTICE:
+ return "NOTICE";
+ case LOGLEVEL_INFO:
+ return "INFO";
+ case LOGLEVEL_DEBUG:
+ return "DEBUG";
+ }
+ return "Unknown log level";
+}
+
+static int mlx5_health_get_severity(u8 rfr_severity)
+{
+ return rfr_severity & MLX5_SEVERITY_VALID_MASK ?
+ rfr_severity & MLX5_SEVERITY_MASK : LOGLEVEL_ERR;
+}
+
static void print_health_info(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
- char fw_str[18];
- u32 fw;
+ u8 rfr_severity;
+ int severity;
int i;
/* If the syndrome is 0, the device is OK and no need to print buffer */
if (!ioread8(&h->synd))
return;
+ rfr_severity = ioread8(&h->rfr_severity);
+ severity = mlx5_health_get_severity(rfr_severity);
+ mlx5_log(dev, severity, "Health issue observed, %s, severity(%d) %s:\n",
+ hsynd_str(ioread8(&h->synd)), severity, mlx5_loglevel_str(severity));
+
for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
- mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i,
- ioread32be(h->assert_var + i));
-
- mlx5_core_err(dev, "assert_exit_ptr 0x%08x\n",
- ioread32be(&h->assert_exit_ptr));
- mlx5_core_err(dev, "assert_callra 0x%08x\n",
- ioread32be(&h->assert_callra));
- sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
- mlx5_core_err(dev, "fw_ver %s\n", fw_str);
- mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
- mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
- mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd),
- hsynd_str(ioread8(&h->synd)));
- mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
- fw = ioread32be(&h->fw_ver);
- mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw);
+ mlx5_log(dev, severity, "assert_var[%d] 0x%08x\n", i,
+ ioread32be(h->assert_var + i));
+
+ mlx5_log(dev, severity, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
+ mlx5_log(dev, severity, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
+ mlx5_log(dev, severity, "fw_ver %d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev),
+ fw_rev_sub(dev));
+ mlx5_log(dev, severity, "time %u\n", ioread32be(&h->time));
+ mlx5_log(dev, severity, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
+ mlx5_log(dev, severity, "rfr %d\n", mlx5_health_get_rfr(rfr_severity));
+ mlx5_log(dev, severity, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity));
+ mlx5_log(dev, severity, "irisc_index %d\n", ioread8(&h->irisc_index));
+ mlx5_log(dev, severity, "synd 0x%x: %s\n", ioread8(&h->synd),
+ hsynd_str(ioread8(&h->synd)));
+ mlx5_log(dev, severity, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
+ mlx5_log(dev, severity, "raw fw_ver 0x%08x\n", ioread32be(&h->fw_ver));
}
static int
@@ -447,6 +490,7 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
{
struct mlx5_core_health *health = &dev->priv.health;
struct health_buffer __iomem *h = health->health;
+ u8 rfr_severity;
int err;
int i;
@@ -479,9 +523,19 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
ioread32be(&h->assert_callra));
if (err)
return err;
+ err = devlink_fmsg_u32_pair_put(fmsg, "time", ioread32be(&h->time));
+ if (err)
+ return err;
err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id));
if (err)
return err;
+ rfr_severity = ioread8(&h->rfr_severity);
+ err = devlink_fmsg_u8_pair_put(fmsg, "rfr", mlx5_health_get_rfr(rfr_severity));
+ if (err)
+ return err;
+ err = devlink_fmsg_u8_pair_put(fmsg, "severity", mlx5_health_get_severity(rfr_severity));
+ if (err)
+ return err;
err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index",
ioread8(&h->irisc_index));
if (err)
@@ -674,13 +728,13 @@ static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
devlink_health_reporter_destroy(health->fw_fatal_reporter);
}
-static unsigned long get_next_poll_jiffies(void)
+static unsigned long get_next_poll_jiffies(struct mlx5_core_dev *dev)
{
unsigned long next;
get_random_bytes(&next, sizeof(next));
next %= HZ;
- next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+ next += jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL));
return next;
}
@@ -698,6 +752,31 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
spin_unlock_irqrestore(&health->wq_lock, flags);
}
+#define MLX5_MSEC_PER_HOUR (MSEC_PER_SEC * 60 * 60)
+static void mlx5_health_log_ts_update(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ u32 out[MLX5_ST_SZ_DW(mrtc_reg)] = {};
+ u32 in[MLX5_ST_SZ_DW(mrtc_reg)] = {};
+ struct mlx5_core_health *health;
+ struct mlx5_core_dev *dev;
+ struct mlx5_priv *priv;
+ u64 now_us;
+
+ health = container_of(dwork, struct mlx5_core_health, update_fw_log_ts_work);
+ priv = container_of(health, struct mlx5_priv, health);
+ dev = container_of(priv, struct mlx5_core_dev, priv);
+
+ now_us = ktime_to_us(ktime_get_real());
+
+ MLX5_SET(mrtc_reg, in, time_h, now_us >> 32);
+ MLX5_SET(mrtc_reg, in, time_l, now_us & 0xFFFFFFFF);
+ mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MRTC, 0, 1);
+
+ queue_delayed_work(health->wq, &health->update_fw_log_ts_work,
+ msecs_to_jiffies(MLX5_MSEC_PER_HOUR));
+}
+
static void poll_health(struct timer_list *t)
{
struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
@@ -740,11 +819,12 @@ static void poll_health(struct timer_list *t)
queue_work(health->wq, &health->report_work);
out:
- mod_timer(&health->timer, get_next_poll_jiffies());
+ mod_timer(&health->timer, get_next_poll_jiffies(dev));
}
void mlx5_start_health_poll(struct mlx5_core_dev *dev)
{
+ u64 poll_interval_ms = mlx5_tout_ms(dev, HEALTH_POLL_INTERVAL);
struct mlx5_core_health *health = &dev->priv.health;
timer_setup(&health->timer, poll_health, 0);
@@ -753,7 +833,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
health->health = &dev->iseg->health;
health->health_counter = &dev->iseg->health_counter;
- health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
+ health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
add_timer(&health->timer);
}
@@ -779,6 +859,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
spin_lock_irqsave(&health->wq_lock, flags);
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock_irqrestore(&health->wq_lock, flags);
+ cancel_delayed_work_sync(&health->update_fw_log_ts_work);
cancel_work_sync(&health->report_work);
cancel_work_sync(&health->fatal_report_work);
}
@@ -794,6 +875,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
+ cancel_delayed_work_sync(&health->update_fw_log_ts_work);
destroy_workqueue(health->wq);
mlx5_fw_reporters_destroy(dev);
}
@@ -819,6 +901,9 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
spin_lock_init(&health->wq_lock);
INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
+ INIT_DELAYED_WORK(&health->update_fw_log_ts_work, mlx5_health_log_ts_update);
+ if (mlx5_core_is_pf(dev))
+ queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
return 0;