linux/drivers/infiniband/hw/ipath/ipath_driver.c
/*
 * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/idr.h>
#include <linux/pci.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/module.h>

#include "ipath_kernel.h"
#include "ipath_verbs.h"

static void ipath_update_pio_bufs(struct ipath_devdata *);

const char *ipath_get_unit_name(int unit)
{
        static char iname[16];
        snprintf(iname, sizeof iname, "infinipath%u", unit);
        return iname;
}

#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
#define PFX IPATH_DRV_NAME ": "

/*
 * The size has to be longer than this string, so we can append
 * board/chip information to it in the init code.
 */
const char ib_ipath_version[] = IPATH_IDSTR "\n";

static struct idr unit_table;
DEFINE_SPINLOCK(ipath_devs_lock);
LIST_HEAD(ipath_dev_list);

wait_queue_head_t ipath_state_wait;

unsigned ipath_debug = __IPATH_INFO;

module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(debug, "mask for debug prints");
EXPORT_SYMBOL_GPL(ipath_debug);

unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");

static unsigned ipath_hol_timeout_ms = 13000;
module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
MODULE_PARM_DESC(hol_timeout_ms,
        "duration of user app suspension after link failure");

unsigned ipath_linkrecovery = 1;
module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");

MODULE_LICENSE("GPL");
MODULE_AUTHOR("QLogic <support@qlogic.com>");
MODULE_DESCRIPTION("QLogic InfiniPath driver");

/*
 * Table to translate the LINKTRAININGSTATE portion of
 * IBCStatus to a human-readable form.
 */
const char *ipath_ibcstatus_str[] = {
        "Disabled",
        "LinkUp",
        "PollActive",
        "PollQuiet",
        "SleepDelay",
        "SleepQuiet",
        "LState6",              /* unused */
        "LState7",              /* unused */
        "CfgDebounce",
        "CfgRcvfCfg",
        "CfgWaitRmt",
        "CfgIdle",
        "RecovRetrain",
        "CfgTxRevLane",         /* unused before IBA7220 */
        "RecovWaitRmt",
        "RecovIdle",
        /* below were added for IBA7220 */
        "CfgEnhanced",
        "CfgTest",
        "CfgWaitRmtTest",
        "CfgWaitCfgEnhanced",
        "SendTS_T",
        "SendTstIdles",
        "RcvTS_T",
        "SendTst_TS1s",
        "LTState18", "LTState19", "LTState1A", "LTState1B",
        "LTState1C", "LTState1D", "LTState1E", "LTState1F"
};

static void __devexit ipath_remove_one(struct pci_dev *);
static int __devinit ipath_init_one(struct pci_dev *,
                                    const struct pci_device_id *);

/* Only needed for registration, nothing else needs this info */
#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
#define PCI_DEVICE_ID_INFINIPATH_HT 0xd

/* Number of seconds before our card status check...  */
#define STATUS_TIMEOUT 60

static const struct pci_device_id ipath_pci_tbl[] = {
        { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
        { 0, }
};

MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);

static struct pci_driver ipath_driver = {
        .name = IPATH_DRV_NAME,
        .probe = ipath_init_one,
        .remove = __devexit_p(ipath_remove_one),
        .id_table = ipath_pci_tbl,
        .driver = {
                .groups = ipath_driver_attr_groups,
        },
};

static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
                             u32 *bar0, u32 *bar1)
{
        int ret;

        ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
        if (ret)
                ipath_dev_err(dd, "failed to read bar0 before enable: "
                              "error %d\n", -ret);

        ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
        if (ret)
                ipath_dev_err(dd, "failed to read bar1 before enable: "
                              "error %d\n", -ret);

        ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
}

static void ipath_free_devdata(struct pci_dev *pdev,
                               struct ipath_devdata *dd)
{
        unsigned long flags;

        pci_set_drvdata(pdev, NULL);

        if (dd->ipath_unit != -1) {
                spin_lock_irqsave(&ipath_devs_lock, flags);
                idr_remove(&unit_table, dd->ipath_unit);
                list_del(&dd->ipath_list);
                spin_unlock_irqrestore(&ipath_devs_lock, flags);
        }
        vfree(dd);
}

static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
{
        unsigned long flags;
        struct ipath_devdata *dd;
        int ret;

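        /*
         * Old two-phase IDR API: idr_pre_get() preallocates with
         * GFP_KERNEL while sleeping is still allowed; idr_get_new()
         * below then assigns the unit number under the spinlock
         * without needing to allocate.
         */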
        if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
                dd = ERR_PTR(-ENOMEM);
                goto bail;
        }

        dd = vzalloc(sizeof(*dd));
        if (!dd) {
                dd = ERR_PTR(-ENOMEM);
                goto bail;
        }
        dd->ipath_unit = -1;

        spin_lock_irqsave(&ipath_devs_lock, flags);

        ret = idr_get_new(&unit_table, dd, &dd->ipath_unit);
        if (ret < 0) {
                printk(KERN_ERR IPATH_DRV_NAME
                       ": Could not allocate unit ID: error %d\n", -ret);
                ipath_free_devdata(pdev, dd);
                dd = ERR_PTR(ret);
                goto bail_unlock;
        }

        dd->pcidev = pdev;
        pci_set_drvdata(pdev, dd);

        list_add(&dd->ipath_list, &ipath_dev_list);

bail_unlock:
        spin_unlock_irqrestore(&ipath_devs_lock, flags);

bail:
        return dd;
}

static inline struct ipath_devdata *__ipath_lookup(int unit)
{
        return idr_find(&unit_table, unit);
}

struct ipath_devdata *ipath_lookup(int unit)
{
        struct ipath_devdata *dd;
        unsigned long flags;

        spin_lock_irqsave(&ipath_devs_lock, flags);
        dd = __ipath_lookup(unit);
        spin_unlock_irqrestore(&ipath_devs_lock, flags);

        return dd;
}

int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
{
        int nunits, npresent, nup;
        struct ipath_devdata *dd;
        unsigned long flags;
        int maxports;

        nunits = npresent = nup = maxports = 0;

        spin_lock_irqsave(&ipath_devs_lock, flags);

        list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
                nunits++;
                if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
                        npresent++;
                if (dd->ipath_lid &&
                    !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
                                         | IPATH_LINKUNK)))
                        nup++;
                if (dd->ipath_cfgports > maxports)
                        maxports = dd->ipath_cfgports;
        }

        spin_unlock_irqrestore(&ipath_devs_lock, flags);

        if (npresentp)
                *npresentp = npresent;
        if (nupp)
                *nupp = nup;
        if (maxportsp)
                *maxportsp = maxports;

        return nunits;
}

/*
 * These next two routines are placeholders in case we don't have per-arch
 * code for controlling write combining.  If explicit control of write
 * combining is not available, performance will probably be awful.
 */

int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
{
        return -EOPNOTSUPP;
}

void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
{
}

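/*
 * For illustration only: a per-arch override (as in ipath_wc_x86_64.c
 * on x86_64) typically maps the chip's PIO buffers write-combining via
 * MTRRs.  A rough sketch, not the actual implementation -- the real
 * code must first compute a properly aligned base and length (pioaddr
 * and piolen below are assumed precomputed):
 *
 *      int ipath_enable_wc(struct ipath_devdata *dd)
 *      {
 *              int cookie = mtrr_add(pioaddr, piolen,
 *                                    MTRR_TYPE_WRCOMB, 1);
 *              if (cookie < 0)
 *                      return -EINVAL;
 *              dd->ipath_wc_cookie = cookie;
 *              return 0;
 *      }
 */
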
/*
 * Perform a PIO buffer bandwidth write test, to verify proper system
 * configuration.  Even when all the setup calls work, occasionally
 * BIOS or other issues can prevent write combining from working, or
 * can cause other bandwidth problems to the chip.
 *
 * This test simply writes the same buffer over and over again, and
 * measures close to the peak bandwidth to the chip (not testing
 * data bandwidth to the wire).   On chips that use an address-based
 * trigger to send packets to the wire, this is easy.  On chips that
 * use a count to trigger, we want to make sure that the packet doesn't
 * go out on the wire, or trigger flow control checks.
 */
static void ipath_verify_pioperf(struct ipath_devdata *dd)
{
        u32 pbnum, cnt, lcnt;
        u32 __iomem *piobuf;
        u32 *addr;
        u64 msecs, emsecs;

        piobuf = ipath_getpiobuf(dd, 0, &pbnum);
        if (!piobuf) {
                dev_info(&dd->pcidev->dev,
                        "No PIObufs for checking perf, skipping\n");
                return;
        }

        /*
         * Enough to give us a reasonable test, less than piobuf size, and
         * likely multiple of store buffer length.
         */
        cnt = 1024;

        addr = vmalloc(cnt);
        if (!addr) {
                dev_info(&dd->pcidev->dev,
                        "Couldn't get memory for checking PIO perf,"
                        " skipping\n");
                goto done;
        }

        preempt_disable();  /* we want reasonably accurate elapsed time */
        msecs = 1 + jiffies_to_msecs(jiffies);
        for (lcnt = 0; lcnt < 10000U; lcnt++) {
                /* wait until we cross msec boundary */
                if (jiffies_to_msecs(jiffies) >= msecs)
                        break;
                udelay(1);
        }

        ipath_disable_armlaunch(dd);

        /*
         * length 0, no dwords actually sent, and mark as VL15
         * on chips where that may matter (due to IB flowcontrol)
         */
        if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
                writeq(1UL << 63, piobuf);
        else
                writeq(0, piobuf);
        ipath_flush_wc();

        /*
         * this is only roughly accurate, since even with preempt we
         * still take interrupts that could take a while.   Running for
         * >= 5 msec seems to get us "close enough" to accurate values
         */
        msecs = jiffies_to_msecs(jiffies);
        for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
                __iowrite32_copy(piobuf + 64, addr, cnt >> 2);
                emsecs = jiffies_to_msecs(jiffies) - msecs;
        }

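        /*
         * Each loop iteration above copied cnt = 1024 bytes, so we moved
         * lcnt KiB in emsecs milliseconds, i.e. roughly lcnt / emsecs
         * MiB/sec.  The check below (lcnt < emsecs * 1024) therefore
         * flags anything under about 1024 KiB/msec, i.e. 1 GiB/sec.
         */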
        /* 1 GiB/sec, slightly over IB SDR line rate */
        if (lcnt < (emsecs * 1024U))
                ipath_dev_err(dd,
                        "Performance problem: bandwidth to PIO buffers is "
                        "only %u MiB/sec\n",
                        lcnt / (u32) emsecs);
        else
                ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
                        lcnt / (u32) emsecs);

        preempt_enable();

        vfree(addr);

done:
        /* disarm piobuf, so it's available again */
        ipath_disarm_piobufs(dd, pbnum, 1);
        ipath_enable_armlaunch(dd);
}

static void cleanup_device(struct ipath_devdata *dd);

static int __devinit ipath_init_one(struct pci_dev *pdev,
                                    const struct pci_device_id *ent)
{
        int ret, len, j;
        struct ipath_devdata *dd;
        unsigned long long addr;
        u32 bar0 = 0, bar1 = 0;

        dd = ipath_alloc_devdata(pdev);
        if (IS_ERR(dd)) {
                ret = PTR_ERR(dd);
                printk(KERN_ERR IPATH_DRV_NAME
                       ": Could not allocate devdata: error %d\n", -ret);
                goto bail;
        }

        ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);

        ret = pci_enable_device(pdev);
        if (ret) {
                /* This can happen iff:
                 *
                 * We did a chip reset, and then failed to reprogram the
                 * BAR, or the chip reset due to an internal error.  We then
                 * unloaded the driver and reloaded it.
                 *
                 * Both reset cases set the BAR back to initial state.  For
                 * the latter case, the AER sticky error bit at offset 0x718
                 * should be set, but the Linux kernel doesn't yet know
                 * about that, it appears.  If the original BAR was retained
                 * in the kernel data structures, this may be OK.
                 */
                ipath_dev_err(dd, "enable unit %d failed: error %d\n",
                              dd->ipath_unit, -ret);
                goto bail_devdata;
        }
        addr = pci_resource_start(pdev, 0);
        len = pci_resource_len(pdev, 0);
        ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
                   "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
                   ent->device, ent->driver_data);

        read_bars(dd, pdev, &bar0, &bar1);

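        /*
         * The low 4 bits of a PCI BAR hold type/flag bits, so bar0 & ~0xf
         * isolates the address portion; if that and all of bar1 are zero,
         * the 64-bit BAR was cleared, most likely by a chip reset.
         */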
        if (!bar1 && !(bar0 & ~0xf)) {
                if (addr) {
                        dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
                                 "rewriting as %llx\n", addr);
                        ret = pci_write_config_dword(
                                pdev, PCI_BASE_ADDRESS_0, addr);
                        if (ret) {
                                ipath_dev_err(dd, "rewrite of BAR0 "
                                              "failed: err %d\n", -ret);
                                goto bail_disable;
                        }
                        ret = pci_write_config_dword(
                                pdev, PCI_BASE_ADDRESS_1, addr >> 32);
                        if (ret) {
                                ipath_dev_err(dd, "rewrite of BAR1 "
                                              "failed: err %d\n", -ret);
                                goto bail_disable;
                        }
                } else {
                        ipath_dev_err(dd, "BAR is 0 (probable RESET), "
                                      "not usable until reboot\n");
                        ret = -ENODEV;
                        goto bail_disable;
                }
        }

        ret = pci_request_regions(pdev, IPATH_DRV_NAME);
        if (ret) {
                dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
                         "err %d\n", dd->ipath_unit, -ret);
                goto bail_disable;
        }

        ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
        if (ret) {
                /*
                 * If the 64-bit setup fails, try 32-bit.  Some systems
                 * do not set up 64-bit maps when 2GB or less memory is
                 * installed.
                 */
                ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
                if (ret) {
                        dev_info(&pdev->dev,
                                "Unable to set DMA mask for unit %u: %d\n",
                                dd->ipath_unit, ret);
                        goto bail_regions;
                } else {
                        ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
                        ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
                        if (ret)
                                dev_info(&pdev->dev,
                                        "Unable to set DMA consistent mask "
                                        "for unit %u: %d\n",
                                        dd->ipath_unit, ret);
                }
        } else {
                ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
                if (ret)
                        dev_info(&pdev->dev,
                                "Unable to set DMA consistent mask "
                                "for unit %u: %d\n",
                                dd->ipath_unit, ret);
        }

        pci_set_master(pdev);

        /*
         * Save BARs to rewrite after device reset.  Save all 64 bits of
         * BAR, just in case.
         */
        dd->ipath_pcibar0 = addr;
        dd->ipath_pcibar1 = addr >> 32;
        dd->ipath_deviceid = ent->device;       /* save for later use */
        dd->ipath_vendorid = ent->vendor;

        /* setup the chip-specific functions, as early as possible. */
        switch (ent->device) {
        case PCI_DEVICE_ID_INFINIPATH_HT:
                ipath_init_iba6110_funcs(dd);
                break;

        default:
                ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
                              "failing\n", ent->device);
                return -ENODEV;
        }

        for (j = 0; j < 6; j++) {
                if (!pdev->resource[j].start)
                        continue;
                ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
                           j, &pdev->resource[j],
                           (unsigned long long)pci_resource_len(pdev, j));
        }

        if (!addr) {
                ipath_dev_err(dd, "No valid address in BAR 0!\n");
                ret = -ENODEV;
                goto bail_regions;
        }

        dd->ipath_pcirev = pdev->revision;

#if defined(__powerpc__)
        /* There isn't a generic way to specify writethrough mappings */
        dd->ipath_kregbase = __ioremap(addr, len,
                (_PAGE_NO_CACHE|_PAGE_WRITETHRU));
#else
        dd->ipath_kregbase = ioremap_nocache(addr, len);
#endif

        if (!dd->ipath_kregbase) {
                ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
                          addr);
                ret = -ENOMEM;
                goto bail_iounmap;
        }
        dd->ipath_kregend = (u64 __iomem *)
                ((void __iomem *)dd->ipath_kregbase + len);
        dd->ipath_physaddr = addr;      /* used for io_remap, etc. */
        /* for user mmap */
        ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
                   addr, dd->ipath_kregbase);

        if (dd->ipath_f_bus(dd, pdev))
                ipath_dev_err(dd, "Failed to setup config space; "
                              "continuing anyway\n");

        /*
         * Set up our interrupt handler; IRQF_SHARED probably isn't needed,
         * since MSI interrupts shouldn't be shared, but it won't hurt for
         * now.  Check for a zero irq only after we return from the
         * chip-specific bus setup, since that setup can affect it.
         */
        if (!dd->ipath_irq)
                ipath_dev_err(dd, "irq is 0, BIOS error?  Interrupts won't "
                              "work\n");
        else {
                ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
                                  IPATH_DRV_NAME, dd);
                if (ret) {
                        ipath_dev_err(dd, "Couldn't setup irq handler, "
                                      "irq=%d: %d\n", dd->ipath_irq, ret);
                        goto bail_iounmap;
                }
        }

        ret = ipath_init_chip(dd, 0);   /* do the chip-specific init */
        if (ret)
                goto bail_irqsetup;

        ret = ipath_enable_wc(dd);

        if (ret) {
                ipath_dev_err(dd, "Write combining not enabled "
                              "(err %d): performance may be poor\n",
                              -ret);
                ret = 0;
        }

        ipath_verify_pioperf(dd);

        ipath_device_create_group(&pdev->dev, dd);
        ipathfs_add_device(dd);
        ipath_user_add(dd);
        ipath_diag_add(dd);
        ipath_register_ib_device(dd);

        goto bail;

bail_irqsetup:
        cleanup_device(dd);

        if (dd->ipath_irq)
                dd->ipath_f_free_irq(dd);

        if (dd->ipath_f_cleanup)
                dd->ipath_f_cleanup(dd);

bail_iounmap:
        iounmap((volatile void __iomem *) dd->ipath_kregbase);

bail_regions:
        pci_release_regions(pdev);

bail_disable:
        pci_disable_device(pdev);

bail_devdata:
        ipath_free_devdata(pdev, dd);

bail:
        return ret;
}

static void cleanup_device(struct ipath_devdata *dd)
{
        int port;
        struct ipath_portdata **tmp;
        unsigned long flags;

        if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
                /* can't do anything more with chip; needs re-init */
                *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
                if (dd->ipath_kregbase) {
                        /*
                         * If we haven't already cleaned up, clear these
                         * now, to ensure any register reads/writes "fail"
                         * until re-init.
                         */
                        dd->ipath_kregbase = NULL;
                        dd->ipath_uregbase = 0;
                        dd->ipath_sregbase = 0;
                        dd->ipath_cregbase = 0;
                        dd->ipath_kregsize = 0;
                }
                ipath_disable_wc(dd);
        }

        if (dd->ipath_spectriggerhit)
                dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
                         dd->ipath_spectriggerhit);

        if (dd->ipath_pioavailregs_dma) {
                dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
                                  (void *) dd->ipath_pioavailregs_dma,
                                  dd->ipath_pioavailregs_phys);
                dd->ipath_pioavailregs_dma = NULL;
        }
        if (dd->ipath_dummy_hdrq) {
                dma_free_coherent(&dd->pcidev->dev,
                        dd->ipath_pd[0]->port_rcvhdrq_size,
                        dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
                dd->ipath_dummy_hdrq = NULL;
        }

        if (dd->ipath_pageshadow) {
                struct page **tmpp = dd->ipath_pageshadow;
                dma_addr_t *tmpd = dd->ipath_physshadow;
                int i, cnt = 0;

                ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
                           "locked\n");
                for (port = 0; port < dd->ipath_cfgports; port++) {
                        int port_tidbase = port * dd->ipath_rcvtidcnt;
                        int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
                        for (i = port_tidbase; i < maxtid; i++) {
                                if (!tmpp[i])
                                        continue;
                                pci_unmap_page(dd->pcidev, tmpd[i],
                                        PAGE_SIZE, PCI_DMA_FROMDEVICE);
                                ipath_release_user_pages(&tmpp[i], 1);
                                tmpp[i] = NULL;
                                cnt++;
                        }
                }
                if (cnt) {
                        ipath_stats.sps_pageunlocks += cnt;
                        ipath_cdbg(VERBOSE, "There were still %u expTID "
                                   "entries locked\n", cnt);
                }
                if (ipath_stats.sps_pagelocks ||
                    ipath_stats.sps_pageunlocks)
                        ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
                                   "unlocked via ipath_m{un}lock\n",
                                   (unsigned long long)
                                   ipath_stats.sps_pagelocks,
                                   (unsigned long long)
                                   ipath_stats.sps_pageunlocks);

                ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
                           dd->ipath_pageshadow);
                tmpp = dd->ipath_pageshadow;
                dd->ipath_pageshadow = NULL;
                vfree(tmpp);

                dd->ipath_egrtidbase = NULL;
        }

        /*
         * Free any resources still in use (usually just kernel ports)
         * at unload; we do this for portcnt, because that's what we
         * allocate.  We acquire the lock to be really paranoid that
         * ipath_pd isn't being accessed from some interrupt-related
         * code (that should not happen, but best to be sure).
         */
        spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
        tmp = dd->ipath_pd;
        dd->ipath_pd = NULL;
        spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
        for (port = 0; port < dd->ipath_portcnt; port++) {
                struct ipath_portdata *pd = tmp[port];
                tmp[port] = NULL; /* debugging paranoia */
                ipath_free_pddata(dd, pd);
        }
        kfree(tmp);
}

static void __devexit ipath_remove_one(struct pci_dev *pdev)
{
        struct ipath_devdata *dd = pci_get_drvdata(pdev);

        ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);

        /*
         * disable the IB link early, to be sure no new packets arrive, which
         * complicates the shutdown process
         */
        ipath_shutdown_device(dd);

        flush_workqueue(ib_wq);

        if (dd->verbs_dev)
                ipath_unregister_ib_device(dd->verbs_dev);

        ipath_diag_remove(dd);
        ipath_user_remove(dd);
        ipathfs_remove_device(dd);
        ipath_device_remove_group(&pdev->dev, dd);

        ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
                   "unit %u\n", dd, (u32) dd->ipath_unit);

        cleanup_device(dd);

        /*
         * Turn off rcv, send, and interrupts for all ports; all drivers
         * should perhaps also hard-reset the chip here.  Free up port 0
         * (kernel) rcvhdr, egr bufs, and eventually tid bufs for all
         * versions of the driver, if they were allocated.
         */
        if (dd->ipath_irq) {
                ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
                           dd->ipath_unit, dd->ipath_irq);
                dd->ipath_f_free_irq(dd);
        } else
                ipath_dbg("irq is 0, not doing free_irq "
                          "for unit %u\n", dd->ipath_unit);
        /*
         * we check for NULL here, because it's outside
         * the kregbase check, and we need to call it
         * after the free_irq.  Thus it's possible that
         * the function pointers were never initialized.
         */
        if (dd->ipath_f_cleanup)
                /* clean up chip-specific stuff */
                dd->ipath_f_cleanup(dd);

        ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
        iounmap((volatile void __iomem *) dd->ipath_kregbase);
        pci_release_regions(pdev);
        ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
        pci_disable_device(pdev);

        ipath_free_devdata(pdev, dd);
}

/* general driver use */
DEFINE_MUTEX(ipath_mutex);

static DEFINE_SPINLOCK(ipath_pioavail_lock);

/**
 * ipath_disarm_piobufs - cancel a range of PIO buffers
 * @dd: the infinipath device
 * @first: the first PIO buffer to cancel
 * @cnt: the number of PIO buffers to cancel
 *
 * cancel a range of PIO buffers, used when they might be armed, but
 * not triggered.  Used at init to ensure buffer state, at user
 * process close (in case it died while writing to a PIO buffer),
 * and also after errors.
 */
void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
                          unsigned cnt)
{
        unsigned i, last = first + cnt;
        unsigned long flags;

        ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
        for (i = first; i < last; i++) {
                spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
                /*
                 * The disarm-related bits are write-only, so it
                 * is ok to OR them in with our copy of sendctrl
                 * while we hold the lock.
                 */
                ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
                        dd->ipath_sendctrl | INFINIPATH_S_DISARM |
                        (i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
                /* can't disarm bufs back-to-back per iba7220 spec */
                ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
                spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
        }
        /* on some older chips, update may not happen after cancel */
        ipath_force_pio_avail_update(dd);
}

/**
 * ipath_wait_linkstate - wait for an IB link state change to occur
 * @dd: the infinipath device
 * @state: the state to wait for
 * @msecs: the number of milliseconds to wait
 *
 * wait up to msecs milliseconds for an IB link state change to occur.
 * For now, take the easy polling route.  Currently used only by
 * ipath_set_linkstate.  Returns 0 if state reached, otherwise
 * -ETIMEDOUT.  state can have multiple bits set, to match any of
 * several transitions.
 */
int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
{
        dd->ipath_state_wanted = state;
        wait_event_interruptible_timeout(ipath_state_wait,
                                         (dd->ipath_flags & state),
                                         msecs_to_jiffies(msecs));
        dd->ipath_state_wanted = 0;

        if (!(dd->ipath_flags & state)) {
                u64 val;
                ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
                           " ms\n",
                           /* test INIT ahead of DOWN, both can be set */
                           (state & IPATH_LINKINIT) ? "INIT" :
                           ((state & IPATH_LINKDOWN) ? "DOWN" :
                            ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
                           msecs);
                val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
                ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
                           (unsigned long long) ipath_read_kreg64(
                                   dd, dd->ipath_kregs->kr_ibcctrl),
                           (unsigned long long) val,
                           ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
        }
        return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
}

static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
        char *buf, size_t blen)
{
        static const struct {
                ipath_err_t err;
                const char *msg;
        } errs[] = {
                { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
                { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
                { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
                { INFINIPATH_E_SDMABASE, "SDmaBase" },
                { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
                { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
                { INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
                { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
                { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
                { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
                { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
                { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
        };
        int i;
        int expected;
        size_t bidx = 0;

        for (i = 0; i < ARRAY_SIZE(errs); i++) {
                expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
                        test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
                if ((err & errs[i].err) && !expected)
                        bidx += snprintf(buf + bidx, blen - bidx,
                                         "%s ", errs[i].msg);
        }
}

/*
 * Decode the error status into strings, deciding whether to always
 * print it or not, depending on "normal packet errors" vs everything
 * else.  Return 1 if "real" errors, otherwise 0 if only packet
 * errors, so the caller can decide what to print with the string.
 */
int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
        ipath_err_t err)
{
        int iserr = 1;
        *buf = '\0';
        if (err & INFINIPATH_E_PKTERRS) {
                if (!(err & ~INFINIPATH_E_PKTERRS))
                        iserr = 0; /* if only packet errors */
                if (ipath_debug & __IPATH_ERRPKTDBG) {
                        if (err & INFINIPATH_E_REBP)
                                strlcat(buf, "EBP ", blen);
                        if (err & INFINIPATH_E_RVCRC)
                                strlcat(buf, "VCRC ", blen);
                        if (err & INFINIPATH_E_RICRC) {
                                strlcat(buf, "CRC ", blen);
                                /* clear for check below, so only once */
                                err &= INFINIPATH_E_RICRC;
                        }
                        if (err & INFINIPATH_E_RSHORTPKTLEN)
                                strlcat(buf, "rshortpktlen ", blen);
                        if (err & INFINIPATH_E_SDROPPEDDATAPKT)
                                strlcat(buf, "sdroppeddatapkt ", blen);
                        if (err & INFINIPATH_E_SPKTLEN)
                                strlcat(buf, "spktlen ", blen);
                }
                if ((err & INFINIPATH_E_RICRC) &&
                        !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
                        strlcat(buf, "CRC ", blen);
                if (!iserr)
                        goto done;
        }
        if (err & INFINIPATH_E_RHDRLEN)
                strlcat(buf, "rhdrlen ", blen);
        if (err & INFINIPATH_E_RBADTID)
                strlcat(buf, "rbadtid ", blen);
        if (err & INFINIPATH_E_RBADVERSION)
                strlcat(buf, "rbadversion ", blen);
        if (err & INFINIPATH_E_RHDR)
                strlcat(buf, "rhdr ", blen);
        if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
                strlcat(buf, "sendspecialtrigger ", blen);
        if (err & INFINIPATH_E_RLONGPKTLEN)
                strlcat(buf, "rlongpktlen ", blen);
        if (err & INFINIPATH_E_RMAXPKTLEN)
                strlcat(buf, "rmaxpktlen ", blen);
        if (err & INFINIPATH_E_RMINPKTLEN)
                strlcat(buf, "rminpktlen ", blen);
        if (err & INFINIPATH_E_SMINPKTLEN)
                strlcat(buf, "sminpktlen ", blen);
        if (err & INFINIPATH_E_RFORMATERR)
                strlcat(buf, "rformaterr ", blen);
        if (err & INFINIPATH_E_RUNSUPVL)
                strlcat(buf, "runsupvl ", blen);
        if (err & INFINIPATH_E_RUNEXPCHAR)
                strlcat(buf, "runexpchar ", blen);
        if (err & INFINIPATH_E_RIBFLOW)
                strlcat(buf, "ribflow ", blen);
        if (err & INFINIPATH_E_SUNDERRUN)
                strlcat(buf, "sunderrun ", blen);
        if (err & INFINIPATH_E_SPIOARMLAUNCH)
                strlcat(buf, "spioarmlaunch ", blen);
        if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
                strlcat(buf, "sunexperrpktnum ", blen);
        if (err & INFINIPATH_E_SDROPPEDSMPPKT)
                strlcat(buf, "sdroppedsmppkt ", blen);
        if (err & INFINIPATH_E_SMAXPKTLEN)
                strlcat(buf, "smaxpktlen ", blen);
        if (err & INFINIPATH_E_SUNSUPVL)
                strlcat(buf, "sunsupVL ", blen);
        if (err & INFINIPATH_E_INVALIDADDR)
                strlcat(buf, "invalidaddr ", blen);
        if (err & INFINIPATH_E_RRCVEGRFULL)
                strlcat(buf, "rcvegrfull ", blen);
        if (err & INFINIPATH_E_RRCVHDRFULL)
                strlcat(buf, "rcvhdrfull ", blen);
        if (err & INFINIPATH_E_IBSTATUSCHANGED)
                strlcat(buf, "ibcstatuschg ", blen);
        if (err & INFINIPATH_E_RIBLOSTLINK)
                strlcat(buf, "riblostlink ", blen);
        if (err & INFINIPATH_E_HARDWARE)
                strlcat(buf, "hardware ", blen);
        if (err & INFINIPATH_E_RESET)
                strlcat(buf, "reset ", blen);
        if (err & INFINIPATH_E_SDMAERRS)
                decode_sdma_errs(dd, err, buf, blen);
        if (err & INFINIPATH_E_INVALIDEEPCMD)
                strlcat(buf, "invalideepromcmd ", blen);
done:
        return iserr;
}

/**
 * get_rhf_errstring - decode RHF errors
 * @err: the err number
 * @msg: the output buffer
 * @len: the length of the output buffer
 *
 * only used in one place now; may want more later
 */
static void get_rhf_errstring(u32 err, char *msg, size_t len)
{
        /* start with an empty string, so we needn't check what comes first */
        *msg = '\0';

        if (err & INFINIPATH_RHF_H_ICRCERR)
                strlcat(msg, "icrcerr ", len);
        if (err & INFINIPATH_RHF_H_VCRCERR)
                strlcat(msg, "vcrcerr ", len);
        if (err & INFINIPATH_RHF_H_PARITYERR)
                strlcat(msg, "parityerr ", len);
        if (err & INFINIPATH_RHF_H_LENERR)
                strlcat(msg, "lenerr ", len);
        if (err & INFINIPATH_RHF_H_MTUERR)
                strlcat(msg, "mtuerr ", len);
        if (err & INFINIPATH_RHF_H_IHDRERR)
                /* infinipath hdr checksum error */
                strlcat(msg, "ipathhdrerr ", len);
        if (err & INFINIPATH_RHF_H_TIDERR)
                strlcat(msg, "tiderr ", len);
        if (err & INFINIPATH_RHF_H_MKERR)
                /* bad port, offset, etc. */
                strlcat(msg, "invalid ipathhdr ", len);
        if (err & INFINIPATH_RHF_H_IBERR)
                strlcat(msg, "iberr ", len);
        if (err & INFINIPATH_RHF_L_SWA)
                strlcat(msg, "swA ", len);
        if (err & INFINIPATH_RHF_L_SWB)
                strlcat(msg, "swB ", len);
}

/**
 * ipath_get_egrbuf - get an eager buffer
 * @dd: the infinipath device
 * @bufnum: the eager buffer to get
 *
 * must only be called if ipath_pd[port] is known to be allocated
 */
static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
{
        return dd->ipath_port0_skbinfo ?
                (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
}

/**
 * ipath_alloc_skb - allocate an skb and buffer with possible constraints
 * @dd: the infinipath device
 * @gfp_mask: the sk_buff GFP mask
 */
struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
                                gfp_t gfp_mask)
{
        struct sk_buff *skb;
        u32 len;

        /*
         * The only fully supported way to handle this is to allocate a
         * lot extra, align as needed, and then do skb_reserve().  That
         * wastes a lot of memory...  I'll have to hack this into
         * infinipath_copy also.
         */

        /*
         * We need 2 extra bytes for ipath_ether data sent in the
         * key header.  In order to keep everything dword aligned,
         * we'll reserve 4 bytes.
         */
        len = dd->ipath_ibmaxlen + 4;

        if (dd->ipath_flags & IPATH_4BYTE_TID) {
                /* We need a 2KB multiple alignment, and there is no way
                 * to do it except to allocate extra and then skb_reserve
                 * enough to bring it up to the right alignment.
                 */
                len += 2047;
        }

        skb = __dev_alloc_skb(len, gfp_mask);
        if (!skb) {
                ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
                              len);
                goto bail;
        }

        skb_reserve(skb, 4);

        if (dd->ipath_flags & IPATH_4BYTE_TID) {
                u32 una = (unsigned long)skb->data & 2047;
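                /*
                 * una is the offset into the current 2KB (0x800) chunk;
                 * reserving 2048 - una advances skb->data to the next
                 * 2KB boundary.  E.g. data ending in 0x900: una = 0x100,
                 * so we reserve 0x700 bytes and land on 0x1000.
                 */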
                if (una)
                        skb_reserve(skb, 2048 - una);
        }

bail:
        return skb;
}

static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
                             u32 eflags,
                             u32 l,
                             u32 etail,
                             __le32 *rhf_addr,
                             struct ipath_message_header *hdr)
{
        char emsg[128];

        get_rhf_errstring(eflags, emsg, sizeof emsg);
        ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
                   "tlen=%x opcode=%x egridx=%x: %s\n",
                   eflags, l,
                   ipath_hdrget_rcv_type(rhf_addr),
                   ipath_hdrget_length_in_bytes(rhf_addr),
                   be32_to_cpu(hdr->bth[0]) >> 24,
                   etail, emsg);

        /* Count local link integrity errors. */
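        /*
         * n below is the PhyErrThreshold field from ibcctrl; the running
         * counter is decremented on good packets in ipath_kreceive(), so
         * only a burst of more than n CRC errors gets counted as a local
         * link integrity error.
         */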
        if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
                u8 n = (dd->ipath_ibcctrl >>
                        INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
                        INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;

                if (++dd->ipath_lli_counter > n) {
                        dd->ipath_lli_counter = 0;
                        dd->ipath_lli_errors++;
                }
        }
}

/*
 * ipath_kreceive - receive a packet
 * @pd: the infinipath port
 *
 * called from interrupt handler for errors or receive interrupt
 */
void ipath_kreceive(struct ipath_portdata *pd)
{
        struct ipath_devdata *dd = pd->port_dd;
        __le32 *rhf_addr;
        void *ebuf;
        const u32 rsize = dd->ipath_rcvhdrentsize;      /* words */
        const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
        u32 etail = -1, l, hdrqtail;
        struct ipath_message_header *hdr;
        u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
        static u64 totcalls;    /* stats, may eventually remove */
        int last;

        l = pd->port_head;
        rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
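        /*
         * Without a DMA'ed tail register (IPATH_NODMA_RTAIL), each RHF
         * carries a sequence number cycling through 1..13; the queue is
         * empty once the next entry's sequence no longer matches our
         * expected port_seq_cnt.
         */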
        if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
                u32 seq = ipath_hdrget_seq(rhf_addr);

                if (seq != pd->port_seq_cnt)
                        goto bail;
                hdrqtail = 0;
        } else {
                hdrqtail = ipath_get_rcvhdrtail(pd);
                if (l == hdrqtail)
                        goto bail;
                smp_rmb();
        }

reloop:
        for (last = 0, i = 1; !last; i += !last) {
                hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
                eflags = ipath_hdrget_err_flags(rhf_addr);
                etype = ipath_hdrget_rcv_type(rhf_addr);
                /* total length */
                tlen = ipath_hdrget_length_in_bytes(rhf_addr);
                ebuf = NULL;
                if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
                    ipath_hdrget_use_egr_buf(rhf_addr) :
                    (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
                        /*
                         * It turns out that the chip uses an eager buffer
                         * for all non-expected packets, whether it "needs"
                         * one or not.  So always get the index, but don't
                         * set ebuf (so we try to copy data) unless the
                         * length requires it.
                         */
                        etail = ipath_hdrget_index(rhf_addr);
                        updegr = 1;
                        if (tlen > sizeof(*hdr) ||
                            etype == RCVHQ_RCV_TYPE_NON_KD)
                                ebuf = ipath_get_egrbuf(dd, etail);
                }

                /*
                 * both tiderr and ipathhdrerr are set for all plain IB
                 * packets; only ipathhdrerr should be set.
                 */

                if (etype != RCVHQ_RCV_TYPE_NON_KD &&
                    etype != RCVHQ_RCV_TYPE_ERROR &&
                    ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
                    IPS_PROTO_VERSION)
                        ipath_cdbg(PKT, "Bad InfiniPath protocol version "
                                   "%x\n", etype);

                if (unlikely(eflags))
                        ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
                else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
                        ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
                        if (dd->ipath_lli_counter)
                                dd->ipath_lli_counter--;
                } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
                        u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
                        u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
                        ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
                                   "qp=%x), len %x; ignored\n",
                                   etype, opcode, qp, tlen);
                } else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
                        ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
                                  be32_to_cpu(hdr->bth[0]) >> 24);
                else {
                        /*
                         * error packet, type of error unknown.
                         * Probably type 3, but we don't know, so don't
                         * even try to print the opcode, etc.
                         * Usually caused by a "bad packet", that has no
                         * BTH, when the LRH says it should.
                         */
                        ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
                                  " %x, len %x hdrq+%x rhf: %Lx\n",
                                  etail, tlen, l, (unsigned long long)
                                  le64_to_cpu(*(__le64 *) rhf_addr));
                        if (ipath_debug & __IPATH_ERRPKTDBG) {
                                u32 j, *d, dw = rsize-2;
                                if (rsize > (tlen>>2))
                                        dw = tlen>>2;
                                d = (u32 *)hdr;
                                printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
                                        dw);
                                for (j = 0; j < dw; j++)
                                        printk(KERN_DEBUG "%8x%s", d[j],
                                                (j%8) == 7 ? "\n" : " ");
                                printk(KERN_DEBUG ".\n");
                        }
                }
                l += rsize;
                if (l >= maxcnt)
                        l = 0;
                rhf_addr = (__le32 *) pd->port_rcvhdrq +
                        l + dd->ipath_rhf_offset;
                if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
                        u32 seq = ipath_hdrget_seq(rhf_addr);

                        if (++pd->port_seq_cnt > 13)
                                pd->port_seq_cnt = 1;
                        if (seq != pd->port_seq_cnt)
                                last = 1;
                } else if (l == hdrqtail)
                        last = 1;
                /*
                 * update head regs on last packet, and every 16 packets.
                 * Reduce bus traffic, while still trying to prevent
                 * rcvhdrq overflows, for when the queue is nearly full
                 */
                if (last || !(i & 0xf)) {
                        u64 lval = l;

                        /* request IBA6120 and 7220 interrupt only on last */
                        if (last)
                                lval |= dd->ipath_rhdrhead_intr_off;
                        ipath_write_ureg(dd, ur_rcvhdrhead, lval,
                                pd->port_port);
                        if (updegr) {
                                ipath_write_ureg(dd, ur_rcvegrindexhead,
                                                 etail, pd->port_port);
                                updegr = 0;
                        }
                }
        }

        if (!dd->ipath_rhdrhead_intr_off && !reloop &&
            !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
                /* IBA6110 workaround; we can have a race clearing chip
                 * interrupt with another interrupt about to be delivered,
                 * and can clear it before it is delivered on the GPIO
                 * workaround.  By doing the extra check here for the
                 * in-memory tail register updating while we were doing
                 * earlier packets, we "almost" guarantee we have covered
                 * that case.
                 */
                u32 hqtail = ipath_get_rcvhdrtail(pd);
                if (hqtail != hdrqtail) {
                        hdrqtail = hqtail;
                        reloop = 1; /* loop 1 extra time at most */
                        goto reloop;
                }
        }

        pkttot += i;

        pd->port_head = l;

        if (pkttot > ipath_stats.sps_maxpkts_call)
                ipath_stats.sps_maxpkts_call = pkttot;
        ipath_stats.sps_port0pkts += pkttot;
        ipath_stats.sps_avgpkts_call =
                ipath_stats.sps_port0pkts / ++totcalls;

bail:;
}

/**
 * ipath_update_pio_bufs - update shadow copy of the PIO availability map
 * @dd: the infinipath device
 *
 * called whenever our local copy indicates we have run out of send buffers
 * NOTE: This can be called from interrupt context by some code
 * and from non-interrupt context by ipath_getpiobuf().
 */

static void ipath_update_pio_bufs(struct ipath_devdata *dd)
{
        unsigned long flags;
        int i;
        const unsigned piobregs = (unsigned)dd->ipath_pioavregs;

        /* If the generation (check) bits have changed, then we update the
         * busy bit for the corresponding PIO buffer.  This algorithm will
         * modify positions to the value they already have in some cases
         * (i.e., no change), but it's faster than changing only the bits
         * that have changed.
         *
         * We would like to do this atomically, to avoid spinlocks in the
         * critical send path, but that's not really possible, given the
         * type of changes, and that this routine could be called on
         * multiple CPUs simultaneously, so we lock in this routine only,
         * to avoid conflicting updates; all we change is the shadow, and
         * it's a single 64 bit memory location, so by definition the update
         * is atomic in terms of what other CPUs can see in testing the
         * bits.  The spin_lock overhead isn't too bad, since it only
         * happens when all buffers are in use, so only cpu overhead, not
         * latency or bandwidth is affected.
         */
        if (!dd->ipath_pioavailregs_dma) {
                ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
                return;
        }
        if (ipath_debug & __IPATH_VERBDBG) {
                /* only if packet debug and verbose */
                volatile __le64 *dma = dd->ipath_pioavailregs_dma;
                unsigned long *shadow = dd->ipath_pioavailshadow;

                ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
                           "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
                           "s3=%lx\n",
                           (unsigned long long) le64_to_cpu(dma[0]),
                           shadow[0],
                           (unsigned long long) le64_to_cpu(dma[1]),
                           shadow[1],
                           (unsigned long long) le64_to_cpu(dma[2]),
                           shadow[2],
                           (unsigned long long) le64_to_cpu(dma[3]),
                           shadow[3]);
                if (piobregs > 4)
                        ipath_cdbg(
                                PKT, "2nd group, dma4=%llx shad4=%lx, "
                                "d5=%llx s5=%lx, d6=%llx s6=%lx, "
                                "d7=%llx s7=%lx\n",
                                (unsigned long long) le64_to_cpu(dma[4]),
                                shadow[4],
                                (unsigned long long) le64_to_cpu(dma[5]),
                                shadow[5],
                                (unsigned long long) le64_to_cpu(dma[6]),
                                shadow[6],
                                (unsigned long long) le64_to_cpu(dma[7]),
                                shadow[7]);
        }
        spin_lock_irqsave(&ipath_pioavail_lock, flags);
        for (i = 0; i < piobregs; i++) {
                u64 pchbusy, pchg, piov, pnew;
1391                /*
1392                 * Chip Errata: bug 6641; even and odd qwords>3 are swapped
1393                 */
1394                if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
1395                        piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
1396                else
1397                        piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
1398                pchg = dd->ipath_pioavailkernel[i] &
1399                        ~(dd->ipath_pioavailshadow[i] ^ piov);
1400                pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
1401                if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
1402                        pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
1403                        pnew |= piov & pchbusy;
1404                        dd->ipath_pioavailshadow[i] = pnew;
1405                }
1406        }
1407        spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1408}
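
/*
 * Illustrative sketch of the shadow update above (not driver code;
 * values are made up).  Within each 64-bit qword, buffer n uses bit
 * 2n as the generation ("check") bit and bit 2n+1 as the busy bit,
 * so for the kernel-owned check positions:
 *
 *	pchg    = kernel & ~(shadow ^ piov);	generation bits agreeing
 *	pchbusy = pchg << 1;			their matching busy bits
 *
 * The chip's DMA copy (piov) is trusted for a buffer's busy bit only
 * where its generation bit agrees with the shadow; where they differ,
 * the DMA snapshot predates our last allocation of that buffer, and
 * copying its busy bit could wrongly mark the buffer available.
 */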
1409
1410/*
1411 * Used to force an update of pioavailshadow if we can't get a pio buffer.
1412 * Needed primarily when exiting freeze mode after recovering
1413 * from errors.  Done lazily, because it's safer (we are known to not
1414 * be writing pio buffers).
1415 */
1416static void ipath_reset_availshadow(struct ipath_devdata *dd)
1417{
1418        int i, im;
1419        unsigned long flags;
1420
1421        spin_lock_irqsave(&ipath_pioavail_lock, flags);
1422        for (i = 0; i < dd->ipath_pioavregs; i++) {
1423                u64 val, oldval;
1424                /* deal with 6110 chip bug on high register #s */
1425                im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
1426                        i ^ 1 : i;
1427                val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
1428                /*
1429                 * busy out the buffers not in the kernel avail list,
1430                 * without changing the generation bits.
1431                 */
1432                oldval = dd->ipath_pioavailshadow[i];
1433                dd->ipath_pioavailshadow[i] = val |
1434                        ((~dd->ipath_pioavailkernel[i] <<
1435                        INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
1436                        0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
1437                if (oldval != dd->ipath_pioavailshadow[i])
1438                        ipath_dbg("shadow[%d] was %Lx, now %lx\n",
1439                                i, (unsigned long long) oldval,
1440                                dd->ipath_pioavailshadow[i]);
1441        }
1442        spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1443}
1444
1445/**
1446 * ipath_setrcvhdrsize - set the receive header size
1447 * @dd: the infinipath device
1448 * @rhdrsize: the receive header size
1449 *
1450 * called from user init code, and also layered driver init
1451 */
1452int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
1453{
1454        int ret = 0;
1455
1456        if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
1457                if (dd->ipath_rcvhdrsize != rhdrsize) {
1458                        dev_info(&dd->pcidev->dev,
1459                                 "Error: can't set protocol header "
1460                                 "size %u, already %u\n",
1461                                 rhdrsize, dd->ipath_rcvhdrsize);
1462                        ret = -EAGAIN;
1463                } else
1464                        ipath_cdbg(VERBOSE, "Reuse same protocol header "
1465                                   "size %u\n", dd->ipath_rcvhdrsize);
1466        } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
1467                               (sizeof(u64) / sizeof(u32)))) {
1468                ipath_dbg("Error: can't set protocol header size %u "
1469                          "(> max %u)\n", rhdrsize,
1470                          dd->ipath_rcvhdrentsize -
1471                          (u32) (sizeof(u64) / sizeof(u32)));
1472                ret = -EOVERFLOW;
1473        } else {
1474                dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
1475                dd->ipath_rcvhdrsize = rhdrsize;
1476                ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
1477                                 dd->ipath_rcvhdrsize);
1478                ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
1479                           dd->ipath_rcvhdrsize);
1480        }
1481        return ret;
1482}
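
/*
 * Worked example for the size check above (illustrative values only):
 * with a rcvhdrentsize of 16 32-bit words, the largest protocol header
 * we accept is 16 - sizeof(u64)/sizeof(u32) = 14 words, since one
 * qword of each entry is accounted for separately (hence the
 * subtraction).
 */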
1483
1484/*
1485 * debugging code and stats updates if no pio buffers are available.
1486 */
1487static noinline void no_pio_bufs(struct ipath_devdata *dd)
1488{
1489        unsigned long *shadow = dd->ipath_pioavailshadow;
1490        __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
1491
1492        dd->ipath_upd_pio_shadow = 1;
1493
1494        /*
1495         * not atomic, but if we lose a stat count in a while, that's OK
1496         */
1497        ipath_stats.sps_nopiobufs++;
1498        if (!(++dd->ipath_consec_nopiobuf % 100000)) {
1499                ipath_force_pio_avail_update(dd); /* at start */
1500                ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
1501                        "%llx %llx %llx %llx\n"
1502                        "ipath  shadow:  %lx %lx %lx %lx\n",
1503                        dd->ipath_consec_nopiobuf,
1504                        (unsigned long)get_cycles(),
1505                        (unsigned long long) le64_to_cpu(dma[0]),
1506                        (unsigned long long) le64_to_cpu(dma[1]),
1507                        (unsigned long long) le64_to_cpu(dma[2]),
1508                        (unsigned long long) le64_to_cpu(dma[3]),
1509                        shadow[0], shadow[1], shadow[2], shadow[3]);
1510                /*
1511                 * 4 buffers per byte, 4 registers above, cover rest
1512                 * below
1513                 */
1514                if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
1515                    (sizeof(shadow[0]) * 4 * 4))
1516                        ipath_dbg("2nd group: dmacopy: "
1517                                  "%llx %llx %llx %llx\n"
1518                                  "ipath  shadow:  %lx %lx %lx %lx\n",
1519                                  (unsigned long long)le64_to_cpu(dma[4]),
1520                                  (unsigned long long)le64_to_cpu(dma[5]),
1521                                  (unsigned long long)le64_to_cpu(dma[6]),
1522                                  (unsigned long long)le64_to_cpu(dma[7]),
1523                                  shadow[4], shadow[5], shadow[6], shadow[7]);
1524
1525                /* at end, so update likely happened */
1526                ipath_reset_availshadow(dd);
1527        }
1528}
1529
1530/*
1531 * common code for normal driver pio buffer allocation, and reserved
1532 * allocation.
1533 *
1534 * do appropriate marking as busy, etc.
1535 * returns a pointer to the buffer on success, or NULL if none is free.
1536 */
1537static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
1538        u32 *pbufnum, u32 first, u32 last, u32 firsti)
1539{
1540        int i, j, updated = 0;
1541        unsigned piobcnt;
1542        unsigned long flags;
1543        unsigned long *shadow = dd->ipath_pioavailshadow;
1544        u32 __iomem *buf;
1545
1546        piobcnt = last - first;
1547        if (dd->ipath_upd_pio_shadow) {
1548                /*
1549                 * Minor optimization.  If we had no buffers on the last
1550                 * call, start out by doing the update; continue and do the
1551                 * scan even if no buffers were updated, to be paranoid.
1552                 */
1553                ipath_update_pio_bufs(dd);
1554                updated++;
1555                i = first;
1556        } else
1557                i = firsti;
1558rescan:
1559        /*
1560         * While test_and_set_bit() would be atomic, the pair of it and
1561         * the following change_bit() is not, hence the lock.  See if this
1562         * is the cause of the remaining armlaunch errors.
1563         */
1564        spin_lock_irqsave(&ipath_pioavail_lock, flags);
1565        for (j = 0; j < piobcnt; j++, i++) {
1566                if (i >= last)
1567                        i = first;
1568                if (__test_and_set_bit((2 * i) + 1, shadow))
1569                        continue;
1570                /* flip generation bit */
1571                __change_bit(2 * i, shadow);
1572                break;
1573        }
1574        spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1575
1576        if (j == piobcnt) {
1577                if (!updated) {
1578                        /*
1579                         * first time through; shadow exhausted, but there may
1580                         * be buffers available; try an update and then rescan.
1581                         */
1582                        ipath_update_pio_bufs(dd);
1583                        updated++;
1584                        i = first;
1585                        goto rescan;
1586                } else if (updated == 1 && piobcnt <=
1587                        ((dd->ipath_sendctrl
1588                        >> INFINIPATH_S_UPDTHRESH_SHIFT) &
1589                        INFINIPATH_S_UPDTHRESH_MASK)) {
1590                        /*
1591                         * for chips supporting and using the update
1592                         * threshold we need to force an update of the
1593                         * in-memory copy if the count is less than the
1594                         * threshold, then check one more time.
1595                         */
1596                        ipath_force_pio_avail_update(dd);
1597                        ipath_update_pio_bufs(dd);
1598                        updated++;
1599                        i = first;
1600                        goto rescan;
1601                }
1602
1603                no_pio_bufs(dd);
1604                buf = NULL;
1605        } else {
1606                if (i < dd->ipath_piobcnt2k)
1607                        buf = (u32 __iomem *) (dd->ipath_pio2kbase +
1608                                               i * dd->ipath_palign);
1609                else
1610                        buf = (u32 __iomem *)
1611                                (dd->ipath_pio4kbase +
1612                                 (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
1613                if (pbufnum)
1614                        *pbufnum = i;
1615        }
1616
1617        return buf;
1618}
1619
1620/**
1621 * ipath_getpiobuf - find an available pio buffer
1622 * @dd: the infinipath device
1623 * @plen: the size of the PIO buffer needed in 32-bit words
1624 * @pbufnum: the buffer number is placed here
1625 */
1626u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
1627{
1628        u32 __iomem *buf;
1629        u32 pnum, nbufs;
1630        u32 first, lasti;
1631
1632        if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
1633                first = dd->ipath_piobcnt2k;
1634                lasti = dd->ipath_lastpioindexl;
1635        } else {
1636                first = 0;
1637                lasti = dd->ipath_lastpioindex;
1638        }
1639        nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
1640        buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
1641
1642        if (buf) {
1643                /*
1644                 * Set next starting place.  It's just an optimization,
1645                 * it doesn't matter who wins on this, so no locking
1646                 */
1647                if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
1648                        dd->ipath_lastpioindexl = pnum + 1;
1649                else
1650                        dd->ipath_lastpioindex = pnum + 1;
1651                if (dd->ipath_upd_pio_shadow)
1652                        dd->ipath_upd_pio_shadow = 0;
1653                if (dd->ipath_consec_nopiobuf)
1654                        dd->ipath_consec_nopiobuf = 0;
1655                ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
1656                           pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
1657                if (pbufnum)
1658                        *pbufnum = pnum;
1659
1660        }
1661        return buf;
1662}
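
/*
 * Usage sketch for ipath_getpiobuf() (illustrative only, not called
 * from the driver): a sender asks for a buffer sized for its packet
 * length in dwords, writes through the returned chip mapping, and
 * simply retries later if nothing is available, e.g.:
 *
 *	u32 pnum;
 *	u32 __iomem *piobuf = ipath_getpiobuf(dd, plen, &pnum);
 *
 *	if (!piobuf)
 *		return -EBUSY;	caller retries later
 *	... copy pbc and payload to piobuf ...
 */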
1663
1664/**
1665 * ipath_chg_pioavailkernel - change which send buffers are available for kernel
1666 * @dd: the infinipath device
1667 * @start: the starting send buffer number
1668 * @len: the number of send buffers
1669 * @avail: true if the buffers are available for kernel use, false otherwise
1670 */
1671void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
1672                              unsigned len, int avail)
1673{
1674        unsigned long flags;
1675        unsigned end, cnt = 0;
1676
1677        /* There are two bits per send buffer (busy and generation) */
1678        start *= 2;
1679        end = start + len * 2;
1680
1681        spin_lock_irqsave(&ipath_pioavail_lock, flags);
1682        /* Set or clear the busy bit in the shadow. */
1683        while (start < end) {
1684                if (avail) {
1685                        unsigned long dma;
1686                        int i, im;
1687                        /*
1688                         * the BUSY bit will never be set, because we disarm
1689                         * the user buffers before we hand them back to the
1690                         * kernel.  We do have to make sure the generation
1691                         * bit is set correctly in shadow, since it could
1692                         * have changed many times while allocated to user.
1693                         * We can't use the bitmap functions on the full
1694                         * dma array because it is always little-endian, so
1695                         * we have to flip to host-order first.
1696                         * BITS_PER_LONG is slightly wrong, since it's
1697                         * always 64 bits per register in the chip...
1698                         * We only work on 64-bit kernels, so that's OK.
1699                         */
1700                        /* deal with 6110 chip bug on high register #s */
1701                        i = start / BITS_PER_LONG;
1702                        im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
1703                                i ^ 1 : i;
1704                        __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
1705                                + start, dd->ipath_pioavailshadow);
1706                        dma = (unsigned long) le64_to_cpu(
1707                                dd->ipath_pioavailregs_dma[im]);
1708                        if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
1709                                + start) % BITS_PER_LONG, &dma))
1710                                __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
1711                                        + start, dd->ipath_pioavailshadow);
1712                        else
1713                                __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
1714                                        + start, dd->ipath_pioavailshadow);
1715                        __set_bit(start, dd->ipath_pioavailkernel);
1716                } else {
1717                        __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
1718                                dd->ipath_pioavailshadow);
1719                        __clear_bit(start, dd->ipath_pioavailkernel);
1720                }
1721                start += 2;
1722        }
1723
1724        if (dd->ipath_pioupd_thresh) {
1725                end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
1726                cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
1727        }
1728        spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
1729
1730        /*
1731         * When moving buffers from kernel to user, if the number assigned
1732         * to the user is less than the pio update threshold, and threshold
1733         * is supported (cnt was computed > 0), drop the update threshold
1734         * so we update at least once per allocated number of buffers.
1735         * In any case, if the kernel buffers are less than the threshold,
1736         * drop the threshold.  We don't bother increasing it, having once
1737         * decreased it, since it would typically just cycle back and forth.
1738         * If we don't decrease below buffers in use, we can wait a long
1739         * time for an update, until some other context uses PIO buffers.
1740         */
1741        if (!avail && len < cnt)
1742                cnt = len;
1743        if (cnt < dd->ipath_pioupd_thresh) {
1744                dd->ipath_pioupd_thresh = cnt;
1745                ipath_dbg("Decreased pio update threshold to %u\n",
1746                        dd->ipath_pioupd_thresh);
1747                spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1748                dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
1749                        << INFINIPATH_S_UPDTHRESH_SHIFT);
1750                dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
1751                        << INFINIPATH_S_UPDTHRESH_SHIFT;
1752                ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1753                        dd->ipath_sendctrl);
1754                spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1755        }
1756}
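
/*
 * A minimal sketch of the bit layout assumed above (illustrative, not
 * driver code): each buffer n occupies two adjacent bits, which is why
 * this routine doubles start and len:
 *
 *	generation ("check") bit: 2*n + INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
 *	busy bit:                 2*n + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
 *
 * e.g. handing buffers 32..63 to a user port would be
 * ipath_chg_pioavailkernel(dd, 32, 32, 0), and reclaiming them for the
 * kernel ipath_chg_pioavailkernel(dd, 32, 32, 1).
 */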
1757
1758/**
1759 * ipath_create_rcvhdrq - create a receive header queue
1760 * @dd: the infinipath device
1761 * @pd: the port data
1762 *
1763 * this must be contiguous memory (from an I/O perspective), and must be
1764 * DMA'able (which means for some systems, it will go through an IOMMU,
1765 * or be forced into a low address range).
1766 */
1767int ipath_create_rcvhdrq(struct ipath_devdata *dd,
1768                         struct ipath_portdata *pd)
1769{
1770        int ret = 0;
1771
1772        if (!pd->port_rcvhdrq) {
1773                dma_addr_t phys_hdrqtail;
1774                gfp_t gfp_flags = GFP_USER | __GFP_COMP;
1775                int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
1776                                sizeof(u32), PAGE_SIZE);
1777
1778                pd->port_rcvhdrq = dma_alloc_coherent(
1779                        &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
1780                        gfp_flags);
1781
1782                if (!pd->port_rcvhdrq) {
1783                        ipath_dev_err(dd, "attempt to allocate %d bytes "
1784                                      "for port %u rcvhdrq failed\n",
1785                                      amt, pd->port_port);
1786                        ret = -ENOMEM;
1787                        goto bail;
1788                }
1789
1790                if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
1791                        pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
1792                                &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
1793                                GFP_KERNEL);
1794                        if (!pd->port_rcvhdrtail_kvaddr) {
1795                                ipath_dev_err(dd, "attempt to allocate 1 page "
1796                                        "for port %u rcvhdrqtailaddr "
1797                                        "failed\n", pd->port_port);
1798                                ret = -ENOMEM;
1799                                dma_free_coherent(&dd->pcidev->dev, amt,
1800                                        pd->port_rcvhdrq,
1801                                        pd->port_rcvhdrq_phys);
1802                                pd->port_rcvhdrq = NULL;
1803                                goto bail;
1804                        }
1805                        pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
1806                        ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
1807                                   "physical\n", pd->port_port,
1808                                   (unsigned long long) phys_hdrqtail);
1809                }
1810
1811                pd->port_rcvhdrq_size = amt;
1812
1813                ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
1814                           "for port %u rcvhdr Q\n",
1815                           amt >> PAGE_SHIFT, pd->port_rcvhdrq,
1816                           (unsigned long) pd->port_rcvhdrq_phys,
1817                           (unsigned long) pd->port_rcvhdrq_size,
1818                           pd->port_port);
1819        }
1820        else
1821                ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
1822                           "hdrtailaddr@%p %llx physical\n",
1823                           pd->port_port, pd->port_rcvhdrq,
1824                           (unsigned long long) pd->port_rcvhdrq_phys,
1825                           pd->port_rcvhdrtail_kvaddr, (unsigned long long)
1826                           pd->port_rcvhdrqtailaddr_phys);
1827
1828        /* clear for security and sanity on each use */
1829        memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
1830        if (pd->port_rcvhdrtail_kvaddr)
1831                memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
1832
1833        /*
1834         * tell chip each time we init it, even if we are re-using previous
1835         * memory (we zero the register at process close)
1836         */
1837        ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
1838                              pd->port_port, pd->port_rcvhdrqtailaddr_phys);
1839        ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
1840                              pd->port_port, pd->port_rcvhdrq_phys);
1841
1842bail:
1843        return ret;
1844}
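
/*
 * Worked example of the allocation size above (illustrative values
 * only): with ipath_rcvhdrcnt = 64 entries of ipath_rcvhdrentsize = 16
 * dwords each, amt = ALIGN(64 * 16 * sizeof(u32), PAGE_SIZE) = 4096,
 * i.e. exactly one 4K page for the queue, plus one more page for the
 * tail counter when the chip DMAs the tail (no IPATH_NODMA_RTAIL).
 */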
1845
1846
1847/*
1848 * Flush all sends that might be in the ready-to-send state, as well as any
1849 * that are in the process of being sent.  Used whenever we need to be
1850 * sure the send side is idle.  Cleans up all buffer state by canceling
1851 * all pio buffers, and issuing an abort, which cleans up anything in the
1852 * launch fifo.  The cancel is superfluous on some chip versions, but
1853 * it's safer to always do it.
1854 * PIOAvail bits are updated by the chip as if normal send had happened.
1855 */
1856void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
1857{
1858        unsigned long flags;
1859
1860        if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
1861                ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
1862                goto bail;
1863        }
1864        /*
1865         * If we have SDMA, and it's not disabled, we have to kick off the
1866         * abort state machine, provided we aren't already aborting.
1867         * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
1868         * we skip the rest of this routine. It is already "in progress"
1869         */
1870        if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
1871                int skip_cancel;
1872                unsigned long *statp = &dd->ipath_sdma_status;
1873
1874                spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
1875                skip_cancel =
1876                        test_and_set_bit(IPATH_SDMA_ABORTING, statp)
1877                        && !test_bit(IPATH_SDMA_DISABLED, statp);
1878                spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
1879                if (skip_cancel)
1880                        goto bail;
1881        }
1882
1883        ipath_dbg("Cancelling all in-progress send buffers\n");
1884
1885        /* skip armlaunch errs for a while */
1886        dd->ipath_lastcancel = jiffies + HZ / 2;
1887
1888        /*
1889         * The abort bit is auto-clearing.  We also don't want pioavail
1890         * update happening during this, and we don't want any other
1891         * sends going out, so turn those off for the duration.  We read
1892         * the scratch register to be sure that cancels and the abort
1893         * have taken effect in the chip.  Otherwise the two parts are the
1894         * same as in ipath_force_pio_avail_update().
1895         */
1896        spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1897        dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
1898                | INFINIPATH_S_PIOENABLE);
1899        ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1900                dd->ipath_sendctrl | INFINIPATH_S_ABORT);
1901        ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1902        spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1903
1904        /* disarm all send buffers */
1905        ipath_disarm_piobufs(dd, 0,
1906                dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
1907
1908        if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
1909                set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
1910
1911        if (restore_sendctrl) {
1912                /* else done by caller later if needed */
1913                spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1914                dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
1915                        INFINIPATH_S_PIOENABLE;
1916                ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1917                        dd->ipath_sendctrl);
1918                /* and again, be sure all have hit the chip */
1919                ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1920                spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1921        }
1922
1923        if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
1924            !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
1925            test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
1926                spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
1927                /* only wait so long for intr */
1928                dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
1929                dd->ipath_sdma_reset_wait = 200;
1930                if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
1931                        tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
1932                spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
1933        }
1934bail:;
1935}
1936
1937/*
1938 * Force an update of in-memory copy of the pioavail registers, when
1939 * needed for any of a variety of reasons.  We read the scratch register
1940 * to make it highly likely that the update will have happened by the
1941 * time we return.  If already off (as in cancel_sends above), this
1942 * routine is a nop, on the assumption that the caller will "do the
1943 * right thing".
1944 */
1945void ipath_force_pio_avail_update(struct ipath_devdata *dd)
1946{
1947        unsigned long flags;
1948
1949        spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
1950        if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
1951                ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1952                        dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
1953                ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1954                ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
1955                        dd->ipath_sendctrl);
1956                ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
1957        }
1958        spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
1959}
1960
1961static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
1962                                int linitcmd)
1963{
1964        u64 mod_wd;
1965        static const char *what[4] = {
1966                [0] = "NOP",
1967                [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
1968                [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
1969                [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
1970        };
1971
1972        if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
1973                /*
1974                 * If we are told to disable, note that so link-recovery
1975                 * code does not attempt to bring us back up.
1976                 */
1977                preempt_disable();
1978                dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
1979                preempt_enable();
1980        } else if (linitcmd) {
1981                /*
1982                 * Any other linkinitcmd will lead to LINKDOWN and then
1983                 * to INIT (if all is well), so clear flag to let
1984                 * link-recovery code attempt to bring us back up.
1985                 */
1986                preempt_disable();
1987                dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
1988                preempt_enable();
1989        }
1990
1991        mod_wd = (linkcmd << dd->ibcc_lc_shift) |
1992                (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
1993        ipath_cdbg(VERBOSE,
1994                "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
1995                dd->ipath_unit, what[linkcmd], linitcmd,
1996                ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
1997                        ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
1998
1999        ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2000                         dd->ipath_ibcctrl | mod_wd);
2001        /* read from chip so write is flushed */
2002        (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
2003}
2004
2005int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
2006{
2007        u32 lstate;
2008        int ret;
2009
2010        switch (newstate) {
2011        case IPATH_IB_LINKDOWN_ONLY:
2012                ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
2013                /* don't wait */
2014                ret = 0;
2015                goto bail;
2016
2017        case IPATH_IB_LINKDOWN:
2018                ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
2019                                        INFINIPATH_IBCC_LINKINITCMD_POLL);
2020                /* don't wait */
2021                ret = 0;
2022                goto bail;
2023
2024        case IPATH_IB_LINKDOWN_SLEEP:
2025                ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
2026                                        INFINIPATH_IBCC_LINKINITCMD_SLEEP);
2027                /* don't wait */
2028                ret = 0;
2029                goto bail;
2030
2031        case IPATH_IB_LINKDOWN_DISABLE:
2032                ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
2033                                        INFINIPATH_IBCC_LINKINITCMD_DISABLE);
2034                /* don't wait */
2035                ret = 0;
2036                goto bail;
2037
2038        case IPATH_IB_LINKARM:
2039                if (dd->ipath_flags & IPATH_LINKARMED) {
2040                        ret = 0;
2041                        goto bail;
2042                }
2043                if (!(dd->ipath_flags &
2044                      (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
2045                        ret = -EINVAL;
2046                        goto bail;
2047                }
2048                ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
2049
2050                /*
2051                 * Since the port can transition to ACTIVE by receiving
2052                 * a non VL 15 packet, wait for either state.
2053                 */
2054                lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
2055                break;
2056
2057        case IPATH_IB_LINKACTIVE:
2058                if (dd->ipath_flags & IPATH_LINKACTIVE) {
2059                        ret = 0;
2060                        goto bail;
2061                }
2062                if (!(dd->ipath_flags & IPATH_LINKARMED)) {
2063                        ret = -EINVAL;
2064                        goto bail;
2065                }
2066                ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
2067                lstate = IPATH_LINKACTIVE;
2068                break;
2069
2070        case IPATH_IB_LINK_LOOPBACK:
2071                dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
2072                dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
2073                ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2074                                 dd->ipath_ibcctrl);
2075
2076                /* turn heartbeat off, as it causes loopback to fail */
2077                dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2078                                       IPATH_IB_HRTBT_OFF);
2079                /* don't wait */
2080                ret = 0;
2081                goto bail;
2082
2083        case IPATH_IB_LINK_EXTERNAL:
2084                dev_info(&dd->pcidev->dev,
2085                        "Disabling IB local loopback (normal)\n");
2086                dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2087                                       IPATH_IB_HRTBT_ON);
2088                dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
2089                ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2090                                 dd->ipath_ibcctrl);
2091                /* don't wait */
2092                ret = 0;
2093                goto bail;
2094
2095        /*
2096         * Heartbeat can be explicitly enabled by the user via the
2097         * "hrtbt_enable" file, and if disabled, trying to enable it here
2098         * will have no effect.  Implicit changes (heartbeat off when
2099         * loopback on, and vice versa) are included to ease testing.
2100         */
2101        case IPATH_IB_LINK_HRTBT:
2102                ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2103                        IPATH_IB_HRTBT_ON);
2104                goto bail;
2105
2106        case IPATH_IB_LINK_NO_HRTBT:
2107                ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
2108                        IPATH_IB_HRTBT_OFF);
2109                goto bail;
2110
2111        default:
2112                ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
2113                ret = -EINVAL;
2114                goto bail;
2115        }
2116        ret = ipath_wait_linkstate(dd, lstate, 2000);
2117
2118bail:
2119        return ret;
2120}
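
/*
 * Usage sketch (illustrative only): the usual path to a working link
 * is to arm and then activate, checking each step, e.g.:
 *
 *	ret = ipath_set_linkstate(dd, IPATH_IB_LINKARM);
 *	if (!ret)
 *		ret = ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
 *
 * Both of these wait for the requested state (the 2000 passed to
 * ipath_wait_linkstate(), presumably milliseconds), while the LINKDOWN
 * and loopback cases above return without waiting.
 */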
2121
2122/**
2123 * ipath_set_mtu - set the MTU
2124 * @dd: the infinipath device
2125 * @arg: the new MTU
2126 *
2127 * we can handle "any" incoming size; the issue here is whether we
2128 * need to restrict our outgoing size.   For now, we don't do any
2129 * sanity checking on this, and we don't deal with what happens to
2130 * programs that are already running when the size changes.
2131 * NOTE: changing the MTU will usually cause the IBC to go back to
2132 * link INIT state...
2133 */
2134int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
2135{
2136        u32 piosize;
2137        int changed = 0;
2138        int ret;
2139
2140        /*
2141         * mtu is IB data payload max.  It's the largest power of 2 less
2142         * than piosize (or even larger, since it only really controls the
2143         * largest we can receive; we can send the max of the mtu and
2144         * piosize).  We check that it's one of the valid IB sizes.
2145         */
2146        if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
2147            (arg != 4096 || !ipath_mtu4096)) {
2148                ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
2149                ret = -EINVAL;
2150                goto bail;
2151        }
2152        if (dd->ipath_ibmtu == arg) {
2153                ret = 0;        /* same as current */
2154                goto bail;
2155        }
2156
2157        piosize = dd->ipath_ibmaxlen;
2158        dd->ipath_ibmtu = arg;
2159
2160        if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
2161                /* Only if it's not the initial value (or reset to it) */
2162                if (piosize != dd->ipath_init_ibmaxlen) {
2163                        if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
2164                                piosize = dd->ipath_init_ibmaxlen;
2165                        dd->ipath_ibmaxlen = piosize;
2166                        changed = 1;
2167                }
2168        } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
2169                piosize = arg + IPATH_PIO_MAXIBHDR;
2170                ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
2171                           "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
2172                           arg);
2173                dd->ipath_ibmaxlen = piosize;
2174                changed = 1;
2175        }
2176
2177        if (changed) {
2178                u64 ibc = dd->ipath_ibcctrl, ibdw;
2179                /*
2180                 * update our housekeeping variables, and set IBC max
2181                 * size, same as init code; max IBC is max we allow in
2182                 * buffer, less the qword pbc, plus 1 for ICRC, in dwords
2183                 */
2184                dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
2185                ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
2186                ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
2187                         dd->ibcc_mpl_shift);
2188                ibc |= ibdw << dd->ibcc_mpl_shift;
2189                dd->ipath_ibcctrl = ibc;
2190                ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
2191                                 dd->ipath_ibcctrl);
2192                dd->ipath_f_tidtemplate(dd);
2193        }
2194
2195        ret = 0;
2196
2197bail:
2198        return ret;
2199}
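
/*
 * Worked example of the sizing above (illustrative; assumes
 * IPATH_PIO_MAXIBHDR is 128, per ipath_kernel.h): setting an mtu of
 * 2048 when it fits below the initial limit gives piosize = 2048 + 128
 * = 2176, then ipath_ibmaxlen = 2176 - 2 * sizeof(u32) = 2168, and the
 * IBC max packet length field becomes (2168 >> 2) + 1 = 543 dwords.
 */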
2200
2201int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
2202{
2203        dd->ipath_lid = lid;
2204        dd->ipath_lmc = lmc;
2205
2206        dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
2207                (~((1U << lmc) - 1)) << 16);
2208
2209        dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
2210
2211        return 0;
2212}
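
/*
 * A short example of the LIDLMC packing above (illustrative values):
 * the low 16 bits carry the LID and the high 16 bits a mask with the
 * low lmc bits clear, so lid 0x1234 with lmc 2 packs to 0xfffc1234,
 * telling the chip to ignore the low 2 bits of the DLID when matching.
 */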
2213
2214
2215/**
2216 * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
2217 * @dd: the infinipath device
2218 * @regno: the register number to write
2219 * @port: the port containing the register
2220 * @value: the value to write
2221 *
2222 * Registers that vary with the chip implementation constants (port)
2223 * use this routine.
2224 */
2225void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
2226                          unsigned port, u64 value)
2227{
2228        u16 where;
2229
2230        if (port < dd->ipath_portcnt &&
2231            (regno == dd->ipath_kregs->kr_rcvhdraddr ||
2232             regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
2233                where = regno + port;
2234        else
2235                where = -1;
2236
2237        ipath_write_kreg(dd, where, value);
2238}
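
/*
 * For example (illustrative): the per-port registers are laid out
 * consecutively, so writing kr_rcvhdraddr for port 2 targets register
 * index kr_rcvhdraddr + 2; any other regno falls through to where = -1,
 * a deliberately out-of-range index.
 */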
2239
2240/*
2241 * The following routines deal with the "obviously simple" task of
2242 * overriding the state of the LEDs, which normally indicate link
2243 * physical and logical status.  The complications arise in dealing with
2244 * different hardware mappings and the board-dependent routine being
2245 * called from interrupts, and then there's the requirement to _flash_ them.
2246 */
2247#define LED_OVER_FREQ_SHIFT 8
2248#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
2249/* Below is "non-zero" to force override, but both actual LEDs are off */
2250#define LED_OVER_BOTH_OFF (8)
2251
2252static void ipath_run_led_override(unsigned long opaque)
2253{
2254        struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
2255        int timeoff;
2256        int pidx;
2257        u64 lstate, ltstate, val;
2258
2259        if (!(dd->ipath_flags & IPATH_INITTED))
2260                return;
2261
2262        pidx = dd->ipath_led_override_phase++ & 1;
2263        dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
2264        timeoff = dd->ipath_led_override_timeoff;
2265
2266        /*
2267         * The code below potentially restores the LED values per current
2268         * status; it should also possibly set up the traffic-blink
2269         * register, but we leave that to per-chip functions.
2270         */
2271        val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
2272        ltstate = ipath_ib_linktrstate(dd, val);
2273        lstate = ipath_ib_linkstate(dd, val);
2274
2275        dd->ipath_f_setextled(dd, lstate, ltstate);
2276        mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
2277}
2278
2279void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
2280{
2281        int timeoff, freq;
2282
2283        if (!(dd->ipath_flags & IPATH_INITTED))
2284                return;
2285
2286        /* First check if we are blinking. If not, use 1 Hz polling */
2287        timeoff = HZ;
2288        freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
2289
2290        if (freq) {
2291                /* For blink, set each phase from one nybble of val */
2292                dd->ipath_led_override_vals[0] = val & 0xF;
2293                dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
2294                timeoff = (HZ << 4)/freq;
2295        } else {
2296                /* Non-blink: set both phases the same. */
2297                dd->ipath_led_override_vals[0] = val & 0xF;
2298                dd->ipath_led_override_vals[1] = val & 0xF;
2299        }
2300        dd->ipath_led_override_timeoff = timeoff;
2301
2302        /*
2303         * If the timer has not already been started, do so. Use a "quick"
2304         * timeout so the function will be called soon, to look at our request.
2305         */
2306        if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
2307                /* Need to start timer */
2308                init_timer(&dd->ipath_led_override_timer);
2309                dd->ipath_led_override_timer.function =
2310                                                 ipath_run_led_override;
2311                dd->ipath_led_override_timer.data = (unsigned long) dd;
2312                dd->ipath_led_override_timer.expires = jiffies + 1;
2313                add_timer(&dd->ipath_led_override_timer);
2314        } else
2315                atomic_dec(&dd->ipath_led_override_timer_active);
2316}
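
/*
 * Encoding sketch for the override value (illustrative): bits 3:0 and
 * 7:4 are the two LED phases and bits 15:8 the blink frequency, so
 * val = (16 << LED_OVER_FREQ_SHIFT) | (0x2 << 4) | 0x8 alternates
 * between LED states 0x8 and 0x2, each phase lasting (HZ << 4) / 16 =
 * HZ jiffies, i.e. about one second.
 */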
2317
2318/**
2319 * ipath_shutdown_device - shut down a device
2320 * @dd: the infinipath device
2321 *
2322 * This is called to make the device quiet when we are about to
2323 * unload the driver, and also when the device is administratively
2324 * disabled.  It does not free any data structures.
2325 * Everything it does has to be set up again by ipath_init_chip(dd, 1).
2326 */
2327void ipath_shutdown_device(struct ipath_devdata *dd)
2328{
2329        unsigned long flags;
2330
2331        ipath_dbg("Shutting down the device\n");
2332
2333        ipath_hol_up(dd); /* make sure user processes aren't suspended */
2334
2335        dd->ipath_flags |= IPATH_LINKUNK;
2336        dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
2337                             IPATH_LINKINIT | IPATH_LINKARMED |
2338                             IPATH_LINKACTIVE);
2339        *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
2340                                IPATH_STATUS_IB_READY);
2341
2342        /* mask interrupts, but not errors */
2343        ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
2344
2345        dd->ipath_rcvctrl = 0;
2346        ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
2347                         dd->ipath_rcvctrl);
2348
2349        if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
2350                teardown_sdma(dd);
2351
2352        /*
2353         * gracefully stop all sends, allowing any in progress to trickle out
2354         * first.
2355         */
2356        spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
2357        dd->ipath_sendctrl = 0;
2358        ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
2359        /* flush it */
2360        ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
2361        spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
2362
2363        /*
2364         * Wait long enough for anything that's going to trickle out to
2365         * have actually done so.
2366         */
2367        udelay(5);
2368
2369        dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
2370
2371        ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
2372        ipath_cancel_sends(dd, 0);
2373
2374        /*
2375         * we are shutting down, so tell components that care.  We don't do
2376         * this on just a link state change; much as with ethernet, a cable
2377         * unplug, etc. doesn't change driver state.
2378         */
2379        signal_ib_event(dd, IB_EVENT_PORT_ERR);
2380
2381        /* disable IBC */
2382        dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
2383        ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
2384                         dd->ipath_control | INFINIPATH_C_FREEZEMODE);
2385
2386        /*
2387         * clear SerdesEnable and turn the LEDs off; do this here because
2388         * we are unloading, so we can't count on interrupts to move things
2389         * along.  The LEDs are turned off explicitly for the same reason.
2390         */
2391        dd->ipath_f_quiet_serdes(dd);
2392
2393        /* stop all the timers that might still be running */
2394        del_timer_sync(&dd->ipath_hol_timer);
2395        if (dd->ipath_stats_timer_active) {
2396                del_timer_sync(&dd->ipath_stats_timer);
2397                dd->ipath_stats_timer_active = 0;
2398        }
2399        if (dd->ipath_intrchk_timer.data) {
2400                del_timer_sync(&dd->ipath_intrchk_timer);
2401                dd->ipath_intrchk_timer.data = 0;
2402        }
2403        if (atomic_read(&dd->ipath_led_override_timer_active)) {
2404                del_timer_sync(&dd->ipath_led_override_timer);
2405                atomic_set(&dd->ipath_led_override_timer_active, 0);
2406        }
2407
2408        /*
2409         * clear all interrupts and errors, so that the next time the driver
2410         * is loaded or the device is enabled, we know that whatever is set
2411         * happened while we were unloaded.
2412         */
2413        ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
2414                         ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
2415        ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
2416        ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
2417
2418        ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
2419        ipath_update_eeprom_log(dd);
2420}
2421
2422/**
2423 * ipath_free_pddata - free a port's allocated data
2424 * @dd: the infinipath device
2425 * @pd: the portdata structure
2426 *
2427 * free up any allocated data for a port
2428 * This should not touch anything that would affect a simultaneous
2429 * re-allocation of port data, because it is called after ipath_mutex
2430 * is released (and can be called from reinit as well).
2431 * It should never change any chip state, or global driver state.
2432 * (The only exception to global state is freeing the port0 port0_skbs.)
2433 */
2434void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
2435{
2436        if (!pd)
2437                return;
2438
2439        if (pd->port_rcvhdrq) {
2440                ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
2441                           "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
2442                           (unsigned long) pd->port_rcvhdrq_size);
2443                dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
2444                                  pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
2445                pd->port_rcvhdrq = NULL;
2446                if (pd->port_rcvhdrtail_kvaddr) {
2447                        dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
2448                                         pd->port_rcvhdrtail_kvaddr,
2449                                         pd->port_rcvhdrqtailaddr_phys);
2450                        pd->port_rcvhdrtail_kvaddr = NULL;
2451                }
2452        }
2453        if (pd->port_port && pd->port_rcvegrbuf) {
2454                unsigned e;
2455
2456                for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
2457                        void *base = pd->port_rcvegrbuf[e];
2458                        size_t size = pd->port_rcvegrbuf_size;
2459
2460                        ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
2461                                   "chunk %u/%u\n", base,
2462                                   (unsigned long) size,
2463                                   e, pd->port_rcvegrbuf_chunks);
2464                        dma_free_coherent(&dd->pcidev->dev, size,
2465                                base, pd->port_rcvegrbuf_phys[e]);
2466                }
2467                kfree(pd->port_rcvegrbuf);
2468                pd->port_rcvegrbuf = NULL;
2469                kfree(pd->port_rcvegrbuf_phys);
2470                pd->port_rcvegrbuf_phys = NULL;
2471                pd->port_rcvegrbuf_chunks = 0;
2472        } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
2473                unsigned e;
2474                struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
2475
2476                dd->ipath_port0_skbinfo = NULL;
2477                ipath_cdbg(VERBOSE, "free closed port %d "
2478                           "ipath_port0_skbinfo @ %p\n", pd->port_port,
2479                           skbinfo);
2480                for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
2481                        if (skbinfo[e].skb) {
2482                                pci_unmap_single(dd->pcidev, skbinfo[e].phys,
2483                                                 dd->ipath_ibmaxlen,
2484                                                 PCI_DMA_FROMDEVICE);
2485                                dev_kfree_skb(skbinfo[e].skb);
2486                        }
2487                vfree(skbinfo);
2488        }
2489        kfree(pd->port_tid_pg_list);
2490        vfree(pd->subport_uregbase);
2491        vfree(pd->subport_rcvegrbuf);
2492        vfree(pd->subport_rcvhdr_base);
2493        kfree(pd);
2494}
2495
2496static int __init infinipath_init(void)
2497{
2498        int ret;
2499
2500        if (ipath_debug & __IPATH_DBG)
2501                printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
2502
2503        /*
2504         * These must be called before the driver is registered with
2505         * the PCI subsystem.
2506         */
2507        idr_init(&unit_table);
2508        if (!idr_pre_get(&unit_table, GFP_KERNEL)) {
2509                printk(KERN_ERR IPATH_DRV_NAME ": idr_pre_get() failed\n");
2510                ret = -ENOMEM;
2511                goto bail;
2512        }
2513
2514        ret = pci_register_driver(&ipath_driver);
2515        if (ret < 0) {
2516                printk(KERN_ERR IPATH_DRV_NAME
2517                       ": Unable to register driver: error %d\n", -ret);
2518                goto bail_unit;
2519        }
2520
2521        ret = ipath_init_ipathfs();
2522        if (ret < 0) {
2523                printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
2524                       "ipathfs: error %d\n", -ret);
2525                goto bail_pci;
2526        }
2527
2528        goto bail;
2529
2530bail_pci:
2531        pci_unregister_driver(&ipath_driver);
2532
2533bail_unit:
2534        idr_destroy(&unit_table);
2535
2536bail:
2537        return ret;
2538}
2539
2540static void __exit infinipath_cleanup(void)
2541{
2542        ipath_exit_ipathfs();
2543
2544        ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
2545        pci_unregister_driver(&ipath_driver);
2546
2547        idr_destroy(&unit_table);
2548}
2549
2550/**
2551 * ipath_reset_device - reset the chip if possible
2552 * @unit: the device to reset
2553 *
2554 * Whether or not reset is successful, we attempt to re-initialize the chip
2555 * (that is, much like a driver unload/reload).  We clear the INITTED flag
2556 * so that the various entry points will fail until we reinitialize.  For
2557 * now, we only allow this if no user ports are open that use chip resources.
2558 */
2559int ipath_reset_device(int unit)
2560{
2561        int ret, i;
2562        struct ipath_devdata *dd = ipath_lookup(unit);
2563        unsigned long flags;
2564
2565        if (!dd) {
2566                ret = -ENODEV;
2567                goto bail;
2568        }
2569
2570        if (atomic_read(&dd->ipath_led_override_timer_active)) {
2571                /* Need to stop LED timer, _then_ shut off LEDs */
2572                del_timer_sync(&dd->ipath_led_override_timer);
2573                atomic_set(&dd->ipath_led_override_timer_active, 0);
2574        }
2575
2576        /* Shut off LEDs after we are sure timer is not running */
2577        dd->ipath_led_override = LED_OVER_BOTH_OFF;
2578        dd->ipath_f_setextled(dd, 0, 0);
2579
2580        dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
2581
2582        if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
2583                dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
2584                         "not initialized or not present\n", unit);
2585                ret = -ENXIO;
2586                goto bail;
2587        }
2588
2589        spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
2590        if (dd->ipath_pd)
2591                for (i = 1; i < dd->ipath_cfgports; i++) {
2592                        if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
2593                                continue;
2594                        spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
2595                        ipath_dbg("unit %u port %d is in use "
2596                                  "(PID %u cmd %s), can't reset\n",
2597                                  unit, i,
2598                                  pid_nr(dd->ipath_pd[i]->port_pid),
2599                                  dd->ipath_pd[i]->port_comm);
2600                        ret = -EBUSY;
2601                        goto bail;
2602                }
2603        spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
2604
2605        if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
2606                teardown_sdma(dd);
2607
2608        dd->ipath_flags &= ~IPATH_INITTED;
2609        ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
2610        ret = dd->ipath_f_reset(dd);
2611        if (ret == 1) {
2612                ipath_dbg("Reinitializing unit %u after reset attempt\n",
2613                          unit);
2614                ret = ipath_init_chip(dd, 1);
2615        } else
2616                ret = -EAGAIN;
2617        if (ret)
2618                ipath_dev_err(dd, "Reinitialize unit %u after "
2619                              "reset failed with %d\n", unit, ret);
2620        else
2621                dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
2622                         "resetting\n", unit);
2623
2624bail:
2625        return ret;
2626}
2627
/*
 * Send a signal to all the processes that have the driver open through
 * the normal interfaces (i.e., everything other than the diags
 * interface).  Returns the number of signalled processes.
 */
static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
{
        int i, sub, any = 0;
        struct pid *pid;
        unsigned long flags;

        if (!dd->ipath_pd)
                return 0;

        spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
        for (i = 1; i < dd->ipath_cfgports; i++) {
                if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
                        continue;
                pid = dd->ipath_pd[i]->port_pid;
                if (!pid)
                        continue;

                dev_info(&dd->pcidev->dev, "context %d in use "
                         "(PID %u), sending signal %d\n",
                         i, pid_nr(pid), sig);
                kill_pid(pid, sig, 1);
                any++;
                for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
                        pid = dd->ipath_pd[i]->port_subpid[sub];
                        if (!pid)
                                continue;
                        dev_info(&dd->pcidev->dev, "sub-context "
                                "%d:%d in use (PID %u), sending "
                                "signal %d\n", i, sub, pid_nr(pid), sig);
                        kill_pid(pid, sig, 1);
                        any++;
                }
        }
        spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
        return any;
}

static void ipath_hol_signal_down(struct ipath_devdata *dd)
{
        if (ipath_signal_procs(dd, SIGSTOP))
                ipath_dbg("Stopped some processes\n");
        ipath_cancel_sends(dd, 1);
}

static void ipath_hol_signal_up(struct ipath_devdata *dd)
{
        if (ipath_signal_procs(dd, SIGCONT))
                ipath_dbg("Continued some processes\n");
}

/*
 * The link is down: stop any user processes and flush pending sends to
 * prevent HoL blocking, then start the HoL timer, which periodically
 * continues and then re-stops the processes, so they can detect the
 * link down if they want and do something about it.  The timer may
 * already be running, so use mod_timer, not add_timer.
 */
void ipath_hol_down(struct ipath_devdata *dd)
{
        dd->ipath_hol_state = IPATH_HOL_DOWN;
        ipath_hol_signal_down(dd);
        dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
        dd->ipath_hol_timer.expires = jiffies +
                msecs_to_jiffies(ipath_hol_timeout_ms);
        mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
}

/*
 * The link is up: continue any stopped user processes.  If the HoL
 * timer is already running, leave it alone; it becomes a no-op once it
 * sees the link is up.
 */
void ipath_hol_up(struct ipath_devdata *dd)
{
        ipath_hol_signal_up(dd);
        dd->ipath_hol_state = IPATH_HOL_UP;
}

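/*
 * Hypothetical sketch: per the comments above, the expected caller of
 * ipath_hol_up()/ipath_hol_down() is the link state machine (the IB
 * status-change handling elsewhere in the driver).  The function name
 * and its linkup argument are illustrative assumptions, not driver
 * code.
 */
static void example_link_state_change(struct ipath_devdata *dd, int linkup)
{
        if (linkup)
                ipath_hol_up(dd);       /* continue stopped user processes */
        else
                ipath_hol_down(dd);     /* stop users, flush sends, arm timer */
}
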
/*
 * Toggle the running/not-running state of user processes to prevent
 * HoL blocking on chip resources, while still allowing the processes
 * to do their link-down special-case handling.  Should only be called
 * via the timer.
 */
void ipath_hol_event(unsigned long opaque)
{
        struct ipath_devdata *dd = (struct ipath_devdata *)opaque;

        if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
                && dd->ipath_hol_state != IPATH_HOL_UP) {
                dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
                ipath_dbg("Stopping processes\n");
                ipath_hol_signal_down(dd);
        } else { /* may do "extra" if also in ipath_hol_up() */
                dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
                ipath_dbg("Continuing processes\n");
                ipath_hol_signal_up(dd);
        }
        if (dd->ipath_hol_state == IPATH_HOL_UP)
                ipath_dbg("link's up, don't resched timer\n");
        else {
                dd->ipath_hol_timer.expires = jiffies +
                        msecs_to_jiffies(ipath_hol_timeout_ms);
                mod_timer(&dd->ipath_hol_timer,
                        dd->ipath_hol_timer.expires);
        }
}

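/*
 * Sketch of wiring the HoL timer up at init time (the real setup lives
 * in the chip init path, not in this file).  It assumes the old
 * unsigned-long kernel timer API, which is what ipath_hol_event()'s
 * signature implies; example_hol_timer_init() is a hypothetical name.
 */
static void example_hol_timer_init(struct ipath_devdata *dd)
{
        setup_timer(&dd->ipath_hol_timer, ipath_hol_event,
                    (unsigned long) dd);
        dd->ipath_hol_state = IPATH_HOL_UP;
        /* not armed here; ipath_hol_down() arms it with mod_timer() */
}
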
int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
{
        u64 val;

        if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
                return -1;
        if (dd->ipath_rx_pol_inv != new_pol_inv) {
                dd->ipath_rx_pol_inv = new_pol_inv;
                val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
                val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
                         INFINIPATH_XGXS_RX_POL_SHIFT);
                val |= ((u64)dd->ipath_rx_pol_inv) <<
                        INFINIPATH_XGXS_RX_POL_SHIFT;
                ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
        }
        return 0;
}

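/*
 * ipath_set_rx_pol_inv() above follows the driver's usual
 * read-modify-write pattern for a multi-bit register field.  A generic
 * helper in that style might look like the sketch below; it is not
 * part of the driver, and it assumes ipath_kreg is the register-offset
 * type taken by ipath_read_kreg64()/ipath_write_kreg().
 */
static void example_update_kreg_field(struct ipath_devdata *dd,
                                      ipath_kreg regno, u64 mask,
                                      unsigned shift, u64 fieldval)
{
        u64 val = ipath_read_kreg64(dd, regno);

        val &= ~(mask << shift);                /* clear the old field */
        val |= (fieldval & mask) << shift;      /* insert the new value */
        ipath_write_kreg(dd, regno, val);
}
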
/*
 * Disable and enable the armlaunch error.  Used for PIO bandwidth
 * testing on the 7220, which is count-based rather than trigger-based.
 * Safe for the driver's own check, since that happens at init.  Not
 * completely safe when used for user-mode checking, since some error
 * checking can be lost, but not particularly risky, and it only has
 * problematic side-effects in the face of very buggy user code.  There
 * is no reference counting, but that's also fine, given the intended
 * use.
 */
void ipath_enable_armlaunch(struct ipath_devdata *dd)
{
        dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
        ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
                INFINIPATH_E_SPIOARMLAUNCH);
        dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
        ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
                dd->ipath_errormask);
}

void ipath_disable_armlaunch(struct ipath_devdata *dd)
{
        /* clear from maskederrs so recovery doesn't re-enable it later */
        dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
        dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
        ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
                dd->ipath_errormask);
}

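/*
 * Usage sketch for the pair above (hypothetical, per the comment that
 * precedes them): bracketing a count-based PIO bandwidth test on the
 * 7220 so expected armlaunch errors don't pollute the error state.
 * example_pio_bw_test() is a made-up stand-in, not driver code.
 */
static void example_pio_bw_test(struct ipath_devdata *dd)
{
        ipath_disable_armlaunch(dd);    /* suppress expected armlaunch errors */

        /* ... the count-based PIO bandwidth test would run here ... */

        ipath_enable_armlaunch(dd);     /* restore normal error checking */
}
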
module_init(infinipath_init);
module_exit(infinipath_cleanup);