linux/drivers/pci/pcie/err.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * This file implements the error recovery as a core part of PCIe error
   4 * reporting. When a PCIe error is delivered, an error message will be
   5 * collected and printed to console, then, an error recovery procedure
   6 * will be executed by following the PCI error recovery rules.
   7 *
   8 * Copyright (C) 2006 Intel Corp.
   9 *      Tom Long Nguyen (tom.l.nguyen@intel.com)
  10 *      Zhang Yanmin (yanmin.zhang@intel.com)
  11 */
  12
  13#include <linux/pci.h>
  14#include <linux/module.h>
  15#include <linux/pci.h>
  16#include <linux/kernel.h>
  17#include <linux/errno.h>
  18#include <linux/aer.h>
  19#include "portdrv.h"
  20#include "../pci.h"
  21
  22struct aer_broadcast_data {
  23        enum pci_channel_state state;
  24        enum pci_ers_result result;
  25};
  26
  27static pci_ers_result_t merge_result(enum pci_ers_result orig,
  28                                  enum pci_ers_result new)
  29{
  30        if (new == PCI_ERS_RESULT_NO_AER_DRIVER)
  31                return PCI_ERS_RESULT_NO_AER_DRIVER;
  32
  33        if (new == PCI_ERS_RESULT_NONE)
  34                return orig;
  35
  36        switch (orig) {
  37        case PCI_ERS_RESULT_CAN_RECOVER:
  38        case PCI_ERS_RESULT_RECOVERED:
  39                orig = new;
  40                break;
  41        case PCI_ERS_RESULT_DISCONNECT:
  42                if (new == PCI_ERS_RESULT_NEED_RESET)
  43                        orig = PCI_ERS_RESULT_NEED_RESET;
  44                break;
  45        default:
  46                break;
  47        }
  48
  49        return orig;
  50}
  51
  52static int report_error_detected(struct pci_dev *dev, void *data)
  53{
  54        pci_ers_result_t vote;
  55        const struct pci_error_handlers *err_handler;
  56        struct aer_broadcast_data *result_data;
  57
  58        result_data = (struct aer_broadcast_data *) data;
  59
  60        device_lock(&dev->dev);
  61        dev->error_state = result_data->state;
  62
  63        if (!dev->driver ||
  64                !dev->driver->err_handler ||
  65                !dev->driver->err_handler->error_detected) {
  66                if (result_data->state == pci_channel_io_frozen &&
  67                        dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
  68                        /*
  69                         * In case of fatal recovery, if one of down-
  70                         * stream device has no driver. We might be
  71                         * unable to recover because a later insmod
  72                         * of a driver for this device is unaware of
  73                         * its hw state.
  74                         */
  75                        pci_printk(KERN_DEBUG, dev, "device has %s\n",
  76                                   dev->driver ?
  77                                   "no AER-aware driver" : "no driver");
  78                }
  79
  80                /*
  81                 * If there's any device in the subtree that does not
  82                 * have an error_detected callback, returning
  83                 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
  84                 * the subsequent mmio_enabled/slot_reset/resume
  85                 * callbacks of "any" device in the subtree. All the
  86                 * devices in the subtree are left in the error state
  87                 * without recovery.
  88                 */
  89
  90                if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
  91                        vote = PCI_ERS_RESULT_NO_AER_DRIVER;
  92                else
  93                        vote = PCI_ERS_RESULT_NONE;
  94        } else {
  95                err_handler = dev->driver->err_handler;
  96                vote = err_handler->error_detected(dev, result_data->state);
  97                pci_uevent_ers(dev, PCI_ERS_RESULT_NONE);
  98        }
  99
 100        result_data->result = merge_result(result_data->result, vote);
 101        device_unlock(&dev->dev);
 102        return 0;
 103}
 104
 105static int report_mmio_enabled(struct pci_dev *dev, void *data)
 106{
 107        pci_ers_result_t vote;
 108        const struct pci_error_handlers *err_handler;
 109        struct aer_broadcast_data *result_data;
 110
 111        result_data = (struct aer_broadcast_data *) data;
 112
 113        device_lock(&dev->dev);
 114        if (!dev->driver ||
 115                !dev->driver->err_handler ||
 116                !dev->driver->err_handler->mmio_enabled)
 117                goto out;
 118
 119        err_handler = dev->driver->err_handler;
 120        vote = err_handler->mmio_enabled(dev);
 121        result_data->result = merge_result(result_data->result, vote);
 122out:
 123        device_unlock(&dev->dev);
 124        return 0;
 125}
 126
 127static int report_slot_reset(struct pci_dev *dev, void *data)
 128{
 129        pci_ers_result_t vote;
 130        const struct pci_error_handlers *err_handler;
 131        struct aer_broadcast_data *result_data;
 132
 133        result_data = (struct aer_broadcast_data *) data;
 134
 135        device_lock(&dev->dev);
 136        if (!dev->driver ||
 137                !dev->driver->err_handler ||
 138                !dev->driver->err_handler->slot_reset)
 139                goto out;
 140
 141        err_handler = dev->driver->err_handler;
 142        vote = err_handler->slot_reset(dev);
 143        result_data->result = merge_result(result_data->result, vote);
 144out:
 145        device_unlock(&dev->dev);
 146        return 0;
 147}
 148
 149static int report_resume(struct pci_dev *dev, void *data)
 150{
 151        const struct pci_error_handlers *err_handler;
 152
 153        device_lock(&dev->dev);
 154        dev->error_state = pci_channel_io_normal;
 155
 156        if (!dev->driver ||
 157                !dev->driver->err_handler ||
 158                !dev->driver->err_handler->resume)
 159                goto out;
 160
 161        err_handler = dev->driver->err_handler;
 162        err_handler->resume(dev);
 163        pci_uevent_ers(dev, PCI_ERS_RESULT_RECOVERED);
 164out:
 165        device_unlock(&dev->dev);
 166        return 0;
 167}
 168
 169/**
 170 * default_reset_link - default reset function
 171 * @dev: pointer to pci_dev data structure
 172 *
 173 * Invoked when performing link reset on a Downstream Port or a
 174 * Root Port with no aer driver.
 175 */
 176static pci_ers_result_t default_reset_link(struct pci_dev *dev)
 177{
 178        int rc;
 179
 180        rc = pci_bridge_secondary_bus_reset(dev);
 181        pci_printk(KERN_DEBUG, dev, "downstream link has been reset\n");
 182        return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
 183}
 184
 185static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
 186{
 187        struct pci_dev *udev;
 188        pci_ers_result_t status;
 189        struct pcie_port_service_driver *driver = NULL;
 190
 191        if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
 192                /* Reset this port for all subordinates */
 193                udev = dev;
 194        } else {
 195                /* Reset the upstream component (likely downstream port) */
 196                udev = dev->bus->self;
 197        }
 198
 199        /* Use the aer driver of the component firstly */
 200        driver = pcie_port_find_service(udev, service);
 201
 202        if (driver && driver->reset_link) {
 203                status = driver->reset_link(udev);
 204        } else if (udev->has_secondary_link) {
 205                status = default_reset_link(udev);
 206        } else {
 207                pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
 208                        pci_name(udev));
 209                return PCI_ERS_RESULT_DISCONNECT;
 210        }
 211
 212        if (status != PCI_ERS_RESULT_RECOVERED) {
 213                pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
 214                        pci_name(udev));
 215                return PCI_ERS_RESULT_DISCONNECT;
 216        }
 217
 218        return status;
 219}
 220
 221/**
 222 * broadcast_error_message - handle message broadcast to downstream drivers
 223 * @dev: pointer to from where in a hierarchy message is broadcasted down
 224 * @state: error state
 225 * @error_mesg: message to print
 226 * @cb: callback to be broadcasted
 227 *
 228 * Invoked during error recovery process. Once being invoked, the content
 229 * of error severity will be broadcasted to all downstream drivers in a
 230 * hierarchy in question.
 231 */
 232static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 233        enum pci_channel_state state,
 234        char *error_mesg,
 235        int (*cb)(struct pci_dev *, void *))
 236{
 237        struct aer_broadcast_data result_data;
 238
 239        pci_printk(KERN_DEBUG, dev, "broadcast %s message\n", error_mesg);
 240        result_data.state = state;
 241        if (cb == report_error_detected)
 242                result_data.result = PCI_ERS_RESULT_CAN_RECOVER;
 243        else
 244                result_data.result = PCI_ERS_RESULT_RECOVERED;
 245
 246        if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
 247                /*
 248                 * If the error is reported by a bridge, we think this error
 249                 * is related to the downstream link of the bridge, so we
 250                 * do error recovery on all subordinates of the bridge instead
 251                 * of the bridge and clear the error status of the bridge.
 252                 */
 253                if (cb == report_error_detected)
 254                        dev->error_state = state;
 255                pci_walk_bus(dev->subordinate, cb, &result_data);
 256                if (cb == report_resume) {
 257                        pci_aer_clear_device_status(dev);
 258                        pci_cleanup_aer_uncorrect_error_status(dev);
 259                        dev->error_state = pci_channel_io_normal;
 260                }
 261        } else {
 262                /*
 263                 * If the error is reported by an end point, we think this
 264                 * error is related to the upstream link of the end point.
 265                 * The error is non fatal so the bus is ok; just invoke
 266                 * the callback for the function that logged the error.
 267                 */
 268                cb(dev, &result_data);
 269        }
 270
 271        return result_data.result;
 272}
 273
 274/**
 275 * pcie_do_fatal_recovery - handle fatal error recovery process
 276 * @dev: pointer to a pci_dev data structure of agent detecting an error
 277 *
 278 * Invoked when an error is fatal. Once being invoked, removes the devices
 279 * beneath this AER agent, followed by reset link e.g. secondary bus reset
 280 * followed by re-enumeration of devices.
 281 */
 282void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service)
 283{
 284        struct pci_dev *udev;
 285        struct pci_bus *parent;
 286        struct pci_dev *pdev, *temp;
 287        pci_ers_result_t result;
 288
 289        if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)
 290                udev = dev;
 291        else
 292                udev = dev->bus->self;
 293
 294        parent = udev->subordinate;
 295        pci_walk_bus(parent, pci_dev_set_disconnected, NULL);
 296
 297        pci_lock_rescan_remove();
 298        pci_dev_get(dev);
 299        list_for_each_entry_safe_reverse(pdev, temp, &parent->devices,
 300                                         bus_list) {
 301                pci_stop_and_remove_bus_device(pdev);
 302        }
 303
 304        result = reset_link(udev, service);
 305
 306        if ((service == PCIE_PORT_SERVICE_AER) &&
 307            (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) {
 308                /*
 309                 * If the error is reported by a bridge, we think this error
 310                 * is related to the downstream link of the bridge, so we
 311                 * do error recovery on all subordinates of the bridge instead
 312                 * of the bridge and clear the error status of the bridge.
 313                 */
 314                pci_aer_clear_fatal_status(dev);
 315                pci_aer_clear_device_status(dev);
 316        }
 317
 318        if (result == PCI_ERS_RESULT_RECOVERED) {
 319                if (pcie_wait_for_link(udev, true))
 320                        pci_rescan_bus(udev->bus);
 321                pci_info(dev, "Device recovery from fatal error successful\n");
 322        } else {
 323                pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
 324                pci_info(dev, "Device recovery from fatal error failed\n");
 325        }
 326
 327        pci_dev_put(dev);
 328        pci_unlock_rescan_remove();
 329}
 330
 331/**
 332 * pcie_do_nonfatal_recovery - handle nonfatal error recovery process
 333 * @dev: pointer to a pci_dev data structure of agent detecting an error
 334 *
 335 * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast
 336 * error detected message to all downstream drivers within a hierarchy in
 337 * question and return the returned code.
 338 */
 339void pcie_do_nonfatal_recovery(struct pci_dev *dev)
 340{
 341        pci_ers_result_t status;
 342        enum pci_channel_state state;
 343
 344        state = pci_channel_io_normal;
 345
 346        status = broadcast_error_message(dev,
 347                        state,
 348                        "error_detected",
 349                        report_error_detected);
 350
 351        if (status == PCI_ERS_RESULT_CAN_RECOVER)
 352                status = broadcast_error_message(dev,
 353                                state,
 354                                "mmio_enabled",
 355                                report_mmio_enabled);
 356
 357        if (status == PCI_ERS_RESULT_NEED_RESET) {
 358                /*
 359                 * TODO: Should call platform-specific
 360                 * functions to reset slot before calling
 361                 * drivers' slot_reset callbacks?
 362                 */
 363                status = broadcast_error_message(dev,
 364                                state,
 365                                "slot_reset",
 366                                report_slot_reset);
 367        }
 368
 369        if (status != PCI_ERS_RESULT_RECOVERED)
 370                goto failed;
 371
 372        broadcast_error_message(dev,
 373                                state,
 374                                "resume",
 375                                report_resume);
 376
 377        pci_info(dev, "AER: Device recovery successful\n");
 378        return;
 379
 380failed:
 381        pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT);
 382
 383        /* TODO: Should kernel panic here? */
 384        pci_info(dev, "AER: Device recovery failed\n");
 385}
 386