linux/drivers/net/ethernet/mellanox/mlx5/core/health.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
   3 *
   4 * This software is available to you under a choice of one of two
   5 * licenses.  You may choose to be licensed under the terms of the GNU
   6 * General Public License (GPL) Version 2, available from the file
   7 * COPYING in the main directory of this source tree, or the
   8 * OpenIB.org BSD license below:
   9 *
  10 *     Redistribution and use in source and binary forms, with or
  11 *     without modification, are permitted provided that the following
  12 *     conditions are met:
  13 *
  14 *      - Redistributions of source code must retain the above
  15 *        copyright notice, this list of conditions and the following
  16 *        disclaimer.
  17 *
  18 *      - Redistributions in binary form must reproduce the above
  19 *        copyright notice, this list of conditions and the following
  20 *        disclaimer in the documentation and/or other materials
  21 *        provided with the distribution.
  22 *
  23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30 * SOFTWARE.
  31 */
  32
  33#include <linux/kernel.h>
  34#include <linux/module.h>
  35#include <linux/random.h>
  36#include <linux/vmalloc.h>
  37#include <linux/mlx5/driver.h>
  38#include <linux/mlx5/cmd.h>
  39#include "mlx5_core.h"
  40
  41enum {
  42        MLX5_HEALTH_POLL_INTERVAL       = 2 * HZ,
  43        MAX_MISSES                      = 3,
  44};
  45
  46enum {
  47        MLX5_HEALTH_SYNDR_FW_ERR                = 0x1,
  48        MLX5_HEALTH_SYNDR_IRISC_ERR             = 0x7,
  49        MLX5_HEALTH_SYNDR_CRC_ERR               = 0x9,
  50        MLX5_HEALTH_SYNDR_FETCH_PCI_ERR         = 0xa,
  51        MLX5_HEALTH_SYNDR_HW_FTL_ERR            = 0xb,
  52        MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR  = 0xc,
  53        MLX5_HEALTH_SYNDR_EQ_ERR                = 0xd,
  54        MLX5_HEALTH_SYNDR_FFSER_ERR             = 0xf,
  55};
  56
  57static DEFINE_SPINLOCK(health_lock);
  58static LIST_HEAD(health_list);
  59static struct work_struct health_work;
  60
  61static void health_care(struct work_struct *work)
  62{
  63        struct mlx5_core_health *health, *n;
  64        struct mlx5_core_dev *dev;
  65        struct mlx5_priv *priv;
  66        LIST_HEAD(tlist);
  67
  68        spin_lock_irq(&health_lock);
  69        list_splice_init(&health_list, &tlist);
  70
  71        spin_unlock_irq(&health_lock);
  72
  73        list_for_each_entry_safe(health, n, &tlist, list) {
  74                priv = container_of(health, struct mlx5_priv, health);
  75                dev = container_of(priv, struct mlx5_core_dev, priv);
  76                mlx5_core_warn(dev, "handling bad device here\n");
  77                /* nothing yet */
  78                spin_lock_irq(&health_lock);
  79                list_del_init(&health->list);
  80                spin_unlock_irq(&health_lock);
  81        }
  82}
  83
  84static const char *hsynd_str(u8 synd)
  85{
  86        switch (synd) {
  87        case MLX5_HEALTH_SYNDR_FW_ERR:
  88                return "firmware internal error";
  89        case MLX5_HEALTH_SYNDR_IRISC_ERR:
  90                return "irisc not responding";
  91        case MLX5_HEALTH_SYNDR_CRC_ERR:
  92                return "firmware CRC error";
  93        case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
  94                return "ICM fetch PCI error";
  95        case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
  96                return "HW fatal error\n";
  97        case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
  98                return "async EQ buffer overrun";
  99        case MLX5_HEALTH_SYNDR_EQ_ERR:
 100                return "EQ error";
 101        case MLX5_HEALTH_SYNDR_FFSER_ERR:
 102                return "FFSER error";
 103        default:
 104                return "unrecognized error";
 105        }
 106}
 107
 108static u16 read_be16(__be16 __iomem *p)
 109{
 110        return swab16(readl((__force u16 __iomem *) p));
 111}
 112
 113static u32 read_be32(__be32 __iomem *p)
 114{
 115        return swab32(readl((__force u32 __iomem *) p));
 116}
 117
 118static void print_health_info(struct mlx5_core_dev *dev)
 119{
 120        struct mlx5_core_health *health = &dev->priv.health;
 121        struct health_buffer __iomem *h = health->health;
 122        int i;
 123
 124        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
 125                pr_info("assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i));
 126
 127        pr_info("assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr));
 128        pr_info("assert_callra 0x%08x\n", read_be32(&h->assert_callra));
 129        pr_info("fw_ver 0x%08x\n", read_be32(&h->fw_ver));
 130        pr_info("hw_id 0x%08x\n", read_be32(&h->hw_id));
 131        pr_info("irisc_index %d\n", readb(&h->irisc_index));
 132        pr_info("synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd)));
 133        pr_info("ext_sync 0x%04x\n", read_be16(&h->ext_sync));
 134}
 135
 136static void poll_health(unsigned long data)
 137{
 138        struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
 139        struct mlx5_core_health *health = &dev->priv.health;
 140        unsigned long next;
 141        u32 count;
 142
 143        count = ioread32be(health->health_counter);
 144        if (count == health->prev)
 145                ++health->miss_counter;
 146        else
 147                health->miss_counter = 0;
 148
 149        health->prev = count;
 150        if (health->miss_counter == MAX_MISSES) {
 151                mlx5_core_err(dev, "device's health compromised\n");
 152                print_health_info(dev);
 153                spin_lock_irq(&health_lock);
 154                list_add_tail(&health->list, &health_list);
 155                spin_unlock_irq(&health_lock);
 156
 157                queue_work(mlx5_core_wq, &health_work);
 158        } else {
 159                get_random_bytes(&next, sizeof(next));
 160                next %= HZ;
 161                next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
 162                mod_timer(&health->timer, next);
 163        }
 164}
 165
 166void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 167{
 168        struct mlx5_core_health *health = &dev->priv.health;
 169
 170        INIT_LIST_HEAD(&health->list);
 171        init_timer(&health->timer);
 172        health->health = &dev->iseg->health;
 173        health->health_counter = &dev->iseg->health_counter;
 174
 175        health->timer.data = (unsigned long)dev;
 176        health->timer.function = poll_health;
 177        health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
 178        add_timer(&health->timer);
 179}
 180
 181void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
 182{
 183        struct mlx5_core_health *health = &dev->priv.health;
 184
 185        del_timer_sync(&health->timer);
 186
 187        spin_lock_irq(&health_lock);
 188        if (!list_empty(&health->list))
 189                list_del_init(&health->list);
 190        spin_unlock_irq(&health_lock);
 191}
 192
 193void mlx5_health_cleanup(void)
 194{
 195}
 196
 197void  __init mlx5_health_init(void)
 198{
 199        INIT_WORK(&health_work, health_care);
 200}
 201