From cb464ba53c0cb497dcb4a3daaf4fad4b75291863 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Mon, 11 Oct 2021 13:14:28 +0300 Subject: net/mlx5: Extend health buffer dump Enhance health buffer to include: - assert_var5: expose the 6th assert variable. - time: error's time-stamp in seconds (epoch time). - rfr: Recovery Flow Required. When set, indicates that the error cannot be recovered without flow involving reset. - severity: error's severity value, ranging from emergency to debug. Expose them in the health buffer dump (dmesg and devlink fw reporter). Health buffer in dmesg: mlx5_core 0000:08:00.0: print_health_info:425:(pid 912): Health issue observed, firmware internal error, severity(3) ERROR: mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[0] 0x08040700 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[1] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[2] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[3] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[4] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:429:(pid 912): assert_var[5] 0x00000000 mlx5_core 0000:08:00.0: print_health_info:432:(pid 912): assert_exit_ptr 0x00aaf800 mlx5_core 0000:08:00.0: print_health_info:434:(pid 912): assert_callra 0x00aaf70c mlx5_core 0000:08:00.0: print_health_info:436:(pid 912): fw_ver 16.32.492 mlx5_core 0000:08:00.0: print_health_info:437:(pid 912): time 1634819758 mlx5_core 0000:08:00.0: print_health_info:438:(pid 912): hw_id 0x0000020d mlx5_core 0000:08:00.0: print_health_info:439:(pid 912): rfr 0 mlx5_core 0000:08:00.0: print_health_info:440:(pid 912): severity 3 (ERROR) mlx5_core 0000:08:00.0: print_health_info:441:(pid 912): irisc_index 9 mlx5_core 0000:08:00.0: print_health_info:442:(pid 912): synd 0x1: firmware internal error mlx5_core 0000:08:00.0: print_health_info:444:(pid 912): ext_synd 0x802b mlx5_core 0000:08:00.0: 
print_health_info:445:(pid 912): raw fw_ver 0x102001ec Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 73 +++++++++++++++++++++--- 1 file changed, 66 insertions(+), 7 deletions(-) (limited to 'drivers/net/ethernet/mellanox/mlx5/core/health.c') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 6a4dd7f78958..538ef392f54c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "mlx5_core.h" #include "lib/eq.h" #include "lib/mlx5.h" @@ -74,6 +75,11 @@ enum { MLX5_SENSOR_FW_SYND_RFR = 5, }; +enum { + MLX5_SEVERITY_MASK = 0x7, + MLX5_SEVERITY_VALID_MASK = 0x8, +}; + u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; @@ -98,12 +104,19 @@ static bool sensor_pci_not_working(struct mlx5_core_dev *dev) return (ioread32be(&h->fw_ver) == 0xffffffff); } +static int mlx5_health_get_rfr(u8 rfr_severity) +{ + return rfr_severity >> MLX5_RFR_BIT_OFFSET; +} + static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; - u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; u8 synd = ioread8(&h->synd); + u8 rfr; + + rfr = mlx5_health_get_rfr(ioread8(&h->rfr_severity)); if (rfr && synd) mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); @@ -366,18 +379,52 @@ static const char *hsynd_str(u8 synd) } } +static const char *mlx5_loglevel_str(int level) +{ + switch (level) { + case LOGLEVEL_EMERG: + return "EMERGENCY"; + case LOGLEVEL_ALERT: + return "ALERT"; + case LOGLEVEL_CRIT: + return "CRITICAL"; + case LOGLEVEL_ERR: + return "ERROR"; + case LOGLEVEL_WARNING: + return "WARNING"; + case LOGLEVEL_NOTICE: + return "NOTICE"; + case LOGLEVEL_INFO: + 
return "INFO"; + case LOGLEVEL_DEBUG: + return "DEBUG"; + } + return "Unknown log level"; +} + +static int mlx5_health_get_severity(u8 rfr_severity) +{ + return rfr_severity & MLX5_SEVERITY_VALID_MASK ? + rfr_severity & MLX5_SEVERITY_MASK : LOGLEVEL_ERR; +} + static void print_health_info(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; - char fw_str[18]; - u32 fw; + u8 rfr_severity; + int severity; int i; /* If the syndrome is 0, the device is OK and no need to print buffer */ if (!ioread8(&h->synd)) return; + rfr_severity = ioread8(&h->rfr_severity); + severity = mlx5_health_get_severity(rfr_severity); + mlx5_core_err(dev, "Health issue observed, %s, severity(%d) %s:\n", + hsynd_str(ioread8(&h->synd)), severity, mlx5_loglevel_str(severity)); + for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); @@ -386,15 +433,16 @@ static void print_health_info(struct mlx5_core_dev *dev) ioread32be(&h->assert_exit_ptr)); mlx5_core_err(dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); - sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); - mlx5_core_err(dev, "fw_ver %s\n", fw_str); + mlx5_core_err(dev, "fw_ver %d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); + mlx5_core_err(dev, "time %u\n", ioread32be(&h->time)); mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id)); + mlx5_core_err(dev, "rfr %d\n", mlx5_health_get_rfr(rfr_severity)); + mlx5_core_err(dev, "severity %d (%s)\n", severity, mlx5_loglevel_str(severity)); mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index)); mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); - fw = ioread32be(&h->fw_ver); - mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw); + mlx5_core_err(dev, "raw fw_ver 0x%08x\n", 
ioread32be(&h->fw_ver)); } static int @@ -443,6 +491,7 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, { struct mlx5_core_health *health = &dev->priv.health; struct health_buffer __iomem *h = health->health; + u8 rfr_severity; int err; int i; @@ -473,9 +522,19 @@ mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev, return err; err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra", ioread32be(&h->assert_callra)); + if (err) + return err; + err = devlink_fmsg_u32_pair_put(fmsg, "time", ioread32be(&h->time)); if (err) return err; err = devlink_fmsg_u32_pair_put(fmsg, "hw_id", ioread32be(&h->hw_id)); + if (err) + return err; + rfr_severity = ioread8(&h->rfr_severity); + err = devlink_fmsg_u8_pair_put(fmsg, "rfr", mlx5_health_get_rfr(rfr_severity)); + if (err) + return err; + err = devlink_fmsg_u8_pair_put(fmsg, "severity", mlx5_health_get_severity(rfr_severity)); if (err) return err; err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index", -- cgit