linux/drivers/net/ethernet/mellanox/mlx4/catas.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
   3 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34#include <linux/workqueue.h>
  35#include <linux/module.h>
  36
  37#include "mlx4.h"
  38
  39enum {
  40        MLX4_CATAS_POLL_INTERVAL        = 5 * HZ,
  41};
  42
  43
  44
  45int mlx4_internal_err_reset = 1;
  46module_param_named(internal_err_reset, mlx4_internal_err_reset,  int, 0644);
  47MODULE_PARM_DESC(internal_err_reset,
  48                 "Reset device on internal errors if non-zero (default 1)");
  49
  50static int read_vendor_id(struct mlx4_dev *dev)
  51{
  52        u16 vendor_id = 0;
  53        int ret;
  54
  55        ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
  56        if (ret) {
  57                mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
  58                return ret;
  59        }
  60
  61        if (vendor_id == 0xffff) {
  62                mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
  63                return -EINVAL;
  64        }
  65
  66        return 0;
  67}
  68
  69static int mlx4_reset_master(struct mlx4_dev *dev)
  70{
  71        int err = 0;
  72
  73        if (mlx4_is_master(dev))
  74                mlx4_report_internal_err_comm_event(dev);
  75
  76        if (!pci_channel_offline(dev->persist->pdev)) {
  77                err = read_vendor_id(dev);
  78                /* If PCI can't be accessed to read vendor ID we assume that its
  79                 * link was disabled and chip was already reset.
  80                 */
  81                if (err)
  82                        return 0;
  83
  84                err = mlx4_reset(dev);
  85                if (err)
  86                        mlx4_err(dev, "Fail to reset HCA\n");
  87        }
  88
  89        return err;
  90}
  91
  92static int mlx4_reset_slave(struct mlx4_dev *dev)
  93{
  94#define COM_CHAN_RST_REQ_OFFSET 0x10
  95#define COM_CHAN_RST_ACK_OFFSET 0x08
  96
  97        u32 comm_flags;
  98        u32 rst_req;
  99        u32 rst_ack;
 100        unsigned long end;
 101        struct mlx4_priv *priv = mlx4_priv(dev);
 102
 103        if (pci_channel_offline(dev->persist->pdev))
 104                return 0;
 105
 106        comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
 107                                  MLX4_COMM_CHAN_FLAGS));
 108        if (comm_flags == 0xffffffff) {
 109                mlx4_err(dev, "VF reset is not needed\n");
 110                return 0;
 111        }
 112
 113        if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
 114                mlx4_err(dev, "VF reset is not supported\n");
 115                return -EOPNOTSUPP;
 116        }
 117
 118        rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
 119                COM_CHAN_RST_REQ_OFFSET;
 120        rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
 121                COM_CHAN_RST_ACK_OFFSET;
 122        if (rst_req != rst_ack) {
 123                mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
 124                return -EIO;
 125        }
 126
 127        rst_req ^= 1;
 128        mlx4_warn(dev, "VF is sending reset request to Firmware\n");
 129        comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
 130        __raw_writel((__force u32)cpu_to_be32(comm_flags),
 131                     (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
 132
 133        end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
 134        while (time_before(jiffies, end)) {
 135                comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
 136                                          MLX4_COMM_CHAN_FLAGS));
 137                rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
 138                        COM_CHAN_RST_ACK_OFFSET;
 139
 140                /* Reading rst_req again since the communication channel can
 141                 * be reset at any time by the PF and all its bits will be
 142                 * set to zero.
 143                 */
 144                rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
 145                        COM_CHAN_RST_REQ_OFFSET;
 146
 147                if (rst_ack == rst_req) {
 148                        mlx4_warn(dev, "VF Reset succeed\n");
 149                        return 0;
 150                }
 151                cond_resched();
 152        }
 153        mlx4_err(dev, "Fail to send reset over the communication channel\n");
 154        return -ETIMEDOUT;
 155}
 156
 157int mlx4_comm_internal_err(u32 slave_read)
 158{
 159        return (u32)COMM_CHAN_EVENT_INTERNAL_ERR ==
 160                (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0;
 161}
 162
 163void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
 164{
 165        int err;
 166        struct mlx4_dev *dev;
 167
 168        if (!mlx4_internal_err_reset)
 169                return;
 170
 171        mutex_lock(&persist->device_state_mutex);
 172        if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
 173                goto out;
 174
 175        dev = persist->dev;
 176        mlx4_err(dev, "device is going to be reset\n");
 177        if (mlx4_is_slave(dev)) {
 178                err = mlx4_reset_slave(dev);
 179        } else {
 180                mlx4_crdump_collect(dev);
 181                err = mlx4_reset_master(dev);
 182        }
 183
 184        if (!err) {
 185                mlx4_err(dev, "device was reset successfully\n");
 186        } else {
 187                /* EEH could have disabled the PCI channel during reset. That's
 188                 * recoverable and the PCI error flow will handle it.
 189                 */
 190                if (!pci_channel_offline(dev->persist->pdev))
 191                        BUG_ON(1);
 192        }
 193        dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
 194        mutex_unlock(&persist->device_state_mutex);
 195
 196        /* At that step HW was already reset, now notify clients */
 197        mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
 198        mlx4_cmd_wake_completions(dev);
 199        return;
 200
 201out:
 202        mutex_unlock(&persist->device_state_mutex);
 203}
 204
 205static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
 206{
 207        int err = 0;
 208
 209        mlx4_enter_error_state(persist);
 210        mutex_lock(&persist->interface_state_mutex);
 211        if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
 212            !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
 213                err = mlx4_restart_one(persist->pdev);
 214                mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
 215                          err);
 216        }
 217        mutex_unlock(&persist->interface_state_mutex);
 218}
 219
 220static void dump_err_buf(struct mlx4_dev *dev)
 221{
 222        struct mlx4_priv *priv = mlx4_priv(dev);
 223
 224        int i;
 225
 226        mlx4_err(dev, "Internal error detected:\n");
 227        for (i = 0; i < priv->fw.catas_size; ++i)
 228                mlx4_err(dev, "  buf[%02x]: %08x\n",
 229                         i, swab32(readl(priv->catas_err.map + i)));
 230}
 231
 232static void poll_catas(struct timer_list *t)
 233{
 234        struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer);
 235        struct mlx4_dev *dev = &priv->dev;
 236        u32 slave_read;
 237
 238        if (mlx4_is_slave(dev)) {
 239                slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
 240                if (mlx4_comm_internal_err(slave_read)) {
 241                        mlx4_warn(dev, "Internal error detected on the communication channel\n");
 242                        goto internal_err;
 243                }
 244        } else if (readl(priv->catas_err.map)) {
 245                dump_err_buf(dev);
 246                goto internal_err;
 247        }
 248
 249        if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
 250                mlx4_warn(dev, "Internal error mark was detected on device\n");
 251                goto internal_err;
 252        }
 253
 254        mod_timer(&priv->catas_err.timer,
 255                  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
 256        return;
 257
 258internal_err:
 259        if (mlx4_internal_err_reset)
 260                queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
 261}
 262
 263static void catas_reset(struct work_struct *work)
 264{
 265        struct mlx4_dev_persistent *persist =
 266                container_of(work, struct mlx4_dev_persistent,
 267                             catas_work);
 268
 269        mlx4_handle_error_state(persist);
 270}
 271
 272void mlx4_start_catas_poll(struct mlx4_dev *dev)
 273{
 274        struct mlx4_priv *priv = mlx4_priv(dev);
 275        phys_addr_t addr;
 276
 277        INIT_LIST_HEAD(&priv->catas_err.list);
 278        timer_setup(&priv->catas_err.timer, poll_catas, 0);
 279        priv->catas_err.map = NULL;
 280
 281        if (!mlx4_is_slave(dev)) {
 282                addr = pci_resource_start(dev->persist->pdev,
 283                                          priv->fw.catas_bar) +
 284                                          priv->fw.catas_offset;
 285
 286                priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
 287                if (!priv->catas_err.map) {
 288                        mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
 289                                  (unsigned long long)addr);
 290                        return;
 291                }
 292        }
 293
 294        priv->catas_err.timer.expires  =
 295                round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
 296        add_timer(&priv->catas_err.timer);
 297}
 298
 299void mlx4_stop_catas_poll(struct mlx4_dev *dev)
 300{
 301        struct mlx4_priv *priv = mlx4_priv(dev);
 302
 303        del_timer_sync(&priv->catas_err.timer);
 304
 305        if (priv->catas_err.map) {
 306                iounmap(priv->catas_err.map);
 307                priv->catas_err.map = NULL;
 308        }
 309
 310        if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
 311                flush_workqueue(dev->persist->catas_wq);
 312}
 313
 314int  mlx4_catas_init(struct mlx4_dev *dev)
 315{
 316        INIT_WORK(&dev->persist->catas_work, catas_reset);
 317        dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
 318        if (!dev->persist->catas_wq)
 319                return -ENOMEM;
 320
 321        return 0;
 322}
 323
 324void mlx4_catas_end(struct mlx4_dev *dev)
 325{
 326        if (dev->persist->catas_wq) {
 327                destroy_workqueue(dev->persist->catas_wq);
 328                dev->persist->catas_wq = NULL;
 329        }
 330}
 331