linux/drivers/net/ethernet/mellanox/mlx4/catas.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
   3 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34#include <linux/workqueue.h>
  35#include <linux/module.h>
  36
  37#include "mlx4.h"
  38
  39enum {
  40        MLX4_CATAS_POLL_INTERVAL        = 5 * HZ,
  41};
  42
  43
  44
  45int mlx4_internal_err_reset = 1;
  46module_param_named(internal_err_reset, mlx4_internal_err_reset,  int, 0644);
  47MODULE_PARM_DESC(internal_err_reset,
  48                 "Reset device on internal errors if non-zero (default 1)");
  49
  50static int read_vendor_id(struct mlx4_dev *dev)
  51{
  52        u16 vendor_id = 0;
  53        int ret;
  54
  55        ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
  56        if (ret) {
  57                mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
  58                return ret;
  59        }
  60
  61        if (vendor_id == 0xffff) {
  62                mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
  63                return -EINVAL;
  64        }
  65
  66        return 0;
  67}
  68
  69static int mlx4_reset_master(struct mlx4_dev *dev)
  70{
  71        int err = 0;
  72
  73        if (mlx4_is_master(dev))
  74                mlx4_report_internal_err_comm_event(dev);
  75
  76        if (!pci_channel_offline(dev->persist->pdev)) {
  77                err = read_vendor_id(dev);
  78                /* If PCI can't be accessed to read vendor ID we assume that its
  79                 * link was disabled and chip was already reset.
  80                 */
  81                if (err)
  82                        return 0;
  83
  84                err = mlx4_reset(dev);
  85                if (err)
  86                        mlx4_err(dev, "Fail to reset HCA\n");
  87        }
  88
  89        return err;
  90}
  91
  92static int mlx4_reset_slave(struct mlx4_dev *dev)
  93{
  94#define COM_CHAN_RST_REQ_OFFSET 0x10
  95#define COM_CHAN_RST_ACK_OFFSET 0x08
  96
  97        u32 comm_flags;
  98        u32 rst_req;
  99        u32 rst_ack;
 100        unsigned long end;
 101        struct mlx4_priv *priv = mlx4_priv(dev);
 102
 103        if (pci_channel_offline(dev->persist->pdev))
 104                return 0;
 105
 106        comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
 107                                  MLX4_COMM_CHAN_FLAGS));
 108        if (comm_flags == 0xffffffff) {
 109                mlx4_err(dev, "VF reset is not needed\n");
 110                return 0;
 111        }
 112
 113        if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) {
 114                mlx4_err(dev, "VF reset is not supported\n");
 115                return -EOPNOTSUPP;
 116        }
 117
 118        rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
 119                COM_CHAN_RST_REQ_OFFSET;
 120        rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
 121                COM_CHAN_RST_ACK_OFFSET;
 122        if (rst_req != rst_ack) {
 123                mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n");
 124                return -EIO;
 125        }
 126
 127        rst_req ^= 1;
 128        mlx4_warn(dev, "VF is sending reset request to Firmware\n");
 129        comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET;
 130        __raw_writel((__force u32)cpu_to_be32(comm_flags),
 131                     (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS);
 132        /* Make sure that our comm channel write doesn't
 133         * get mixed in with writes from another CPU.
 134         */
 135        mmiowb();
 136
 137        end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies;
 138        while (time_before(jiffies, end)) {
 139                comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm +
 140                                          MLX4_COMM_CHAN_FLAGS));
 141                rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >>
 142                        COM_CHAN_RST_ACK_OFFSET;
 143
 144                /* Reading rst_req again since the communication channel can
 145                 * be reset at any time by the PF and all its bits will be
 146                 * set to zero.
 147                 */
 148                rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >>
 149                        COM_CHAN_RST_REQ_OFFSET;
 150
 151                if (rst_ack == rst_req) {
 152                        mlx4_warn(dev, "VF Reset succeed\n");
 153                        return 0;
 154                }
 155                cond_resched();
 156        }
 157        mlx4_err(dev, "Fail to send reset over the communication channel\n");
 158        return -ETIMEDOUT;
 159}
 160
 161int mlx4_comm_internal_err(u32 slave_read)
 162{
 163        return (u32)COMM_CHAN_EVENT_INTERNAL_ERR ==
 164                (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0;
 165}
 166
 167void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
 168{
 169        int err;
 170        struct mlx4_dev *dev;
 171
 172        if (!mlx4_internal_err_reset)
 173                return;
 174
 175        mutex_lock(&persist->device_state_mutex);
 176        if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
 177                goto out;
 178
 179        dev = persist->dev;
 180        mlx4_err(dev, "device is going to be reset\n");
 181        if (mlx4_is_slave(dev))
 182                err = mlx4_reset_slave(dev);
 183        else
 184                err = mlx4_reset_master(dev);
 185
 186        if (!err) {
 187                mlx4_err(dev, "device was reset successfully\n");
 188        } else {
 189                /* EEH could have disabled the PCI channel during reset. That's
 190                 * recoverable and the PCI error flow will handle it.
 191                 */
 192                if (!pci_channel_offline(dev->persist->pdev))
 193                        BUG_ON(1);
 194        }
 195        dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
 196        mutex_unlock(&persist->device_state_mutex);
 197
 198        /* At that step HW was already reset, now notify clients */
 199        mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
 200        mlx4_cmd_wake_completions(dev);
 201        return;
 202
 203out:
 204        mutex_unlock(&persist->device_state_mutex);
 205}
 206
 207static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
 208{
 209        int err = 0;
 210
 211        mlx4_enter_error_state(persist);
 212        mutex_lock(&persist->interface_state_mutex);
 213        if (persist->interface_state & MLX4_INTERFACE_STATE_UP &&
 214            !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) {
 215                err = mlx4_restart_one(persist->pdev);
 216                mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n",
 217                          err);
 218        }
 219        mutex_unlock(&persist->interface_state_mutex);
 220}
 221
 222static void dump_err_buf(struct mlx4_dev *dev)
 223{
 224        struct mlx4_priv *priv = mlx4_priv(dev);
 225
 226        int i;
 227
 228        mlx4_err(dev, "Internal error detected:\n");
 229        for (i = 0; i < priv->fw.catas_size; ++i)
 230                mlx4_err(dev, "  buf[%02x]: %08x\n",
 231                         i, swab32(readl(priv->catas_err.map + i)));
 232}
 233
 234static void poll_catas(struct timer_list *t)
 235{
 236        struct mlx4_priv *priv = from_timer(priv, t, catas_err.timer);
 237        struct mlx4_dev *dev = &priv->dev;
 238        u32 slave_read;
 239
 240        if (mlx4_is_slave(dev)) {
 241                slave_read = swab32(readl(&priv->mfunc.comm->slave_read));
 242                if (mlx4_comm_internal_err(slave_read)) {
 243                        mlx4_warn(dev, "Internal error detected on the communication channel\n");
 244                        goto internal_err;
 245                }
 246        } else if (readl(priv->catas_err.map)) {
 247                dump_err_buf(dev);
 248                goto internal_err;
 249        }
 250
 251        if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
 252                mlx4_warn(dev, "Internal error mark was detected on device\n");
 253                goto internal_err;
 254        }
 255
 256        mod_timer(&priv->catas_err.timer,
 257                  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
 258        return;
 259
 260internal_err:
 261        if (mlx4_internal_err_reset)
 262                queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
 263}
 264
 265static void catas_reset(struct work_struct *work)
 266{
 267        struct mlx4_dev_persistent *persist =
 268                container_of(work, struct mlx4_dev_persistent,
 269                             catas_work);
 270
 271        mlx4_handle_error_state(persist);
 272}
 273
 274void mlx4_start_catas_poll(struct mlx4_dev *dev)
 275{
 276        struct mlx4_priv *priv = mlx4_priv(dev);
 277        phys_addr_t addr;
 278
 279        INIT_LIST_HEAD(&priv->catas_err.list);
 280        timer_setup(&priv->catas_err.timer, poll_catas, 0);
 281        priv->catas_err.map = NULL;
 282
 283        if (!mlx4_is_slave(dev)) {
 284                addr = pci_resource_start(dev->persist->pdev,
 285                                          priv->fw.catas_bar) +
 286                                          priv->fw.catas_offset;
 287
 288                priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4);
 289                if (!priv->catas_err.map) {
 290                        mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n",
 291                                  (unsigned long long)addr);
 292                        return;
 293                }
 294        }
 295
 296        priv->catas_err.timer.expires  =
 297                round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL);
 298        add_timer(&priv->catas_err.timer);
 299}
 300
 301void mlx4_stop_catas_poll(struct mlx4_dev *dev)
 302{
 303        struct mlx4_priv *priv = mlx4_priv(dev);
 304
 305        del_timer_sync(&priv->catas_err.timer);
 306
 307        if (priv->catas_err.map) {
 308                iounmap(priv->catas_err.map);
 309                priv->catas_err.map = NULL;
 310        }
 311
 312        if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION)
 313                flush_workqueue(dev->persist->catas_wq);
 314}
 315
 316int  mlx4_catas_init(struct mlx4_dev *dev)
 317{
 318        INIT_WORK(&dev->persist->catas_work, catas_reset);
 319        dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health");
 320        if (!dev->persist->catas_wq)
 321                return -ENOMEM;
 322
 323        return 0;
 324}
 325
 326void mlx4_catas_end(struct mlx4_dev *dev)
 327{
 328        if (dev->persist->catas_wq) {
 329                destroy_workqueue(dev->persist->catas_wq);
 330                dev->persist->catas_wq = NULL;
 331        }
 332}
 333