linux/drivers/infiniband/hw/ipath/ipath_file_ops.c
   1/*
   2 * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
   3 * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34#include <linux/pci.h>
  35#include <linux/poll.h>
  36#include <linux/cdev.h>
  37#include <linux/swap.h>
  38#include <linux/export.h>
  39#include <linux/vmalloc.h>
  40#include <linux/slab.h>
  41#include <linux/highmem.h>
  42#include <linux/io.h>
  43#include <linux/jiffies.h>
  44#include <linux/cpu.h>
  45#include <asm/pgtable.h>
  46
  47#include "ipath_kernel.h"
  48#include "ipath_common.h"
  49#include "ipath_user_sdma.h"
  50
  51static int ipath_open(struct inode *, struct file *);
  52static int ipath_close(struct inode *, struct file *);
  53static ssize_t ipath_write(struct file *, const char __user *, size_t,
  54                           loff_t *);
  55static ssize_t ipath_writev(struct kiocb *, const struct iovec *,
  56                            unsigned long, loff_t);
  57static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
  58static int ipath_mmap(struct file *, struct vm_area_struct *);
  59
  60static const struct file_operations ipath_file_ops = {
  61        .owner = THIS_MODULE,
  62        .write = ipath_write,
  63        .aio_write = ipath_writev,
  64        .open = ipath_open,
  65        .release = ipath_close,
  66        .poll = ipath_poll,
  67        .mmap = ipath_mmap,
  68        .llseek = noop_llseek,
  69};
  70
  71/*
  72 * Convert kernel virtual addresses to physical addresses so they don't
  73 * potentially conflict with the chip addresses used as mmap offsets.
  74 * It doesn't really matter what mmap offset we use as long as we can
  75 * interpret it correctly.
  76 */
  77static u64 cvt_kvaddr(void *p)
  78{
  79        struct page *page;
  80        u64 paddr = 0;
  81
  82        page = vmalloc_to_page(p);
  83        if (page)
  84                paddr = page_to_pfn(page) << PAGE_SHIFT;
  85
  86        return paddr;
  87}
  88
  89static int ipath_get_base_info(struct file *fp,
  90                               void __user *ubase, size_t ubase_size)
  91{
  92        struct ipath_portdata *pd = port_fp(fp);
  93        int ret = 0;
  94        struct ipath_base_info *kinfo = NULL;
  95        struct ipath_devdata *dd = pd->port_dd;
  96        unsigned subport_cnt;
  97        int shared, master;
  98        size_t sz;
  99
 100        subport_cnt = pd->port_subport_cnt;
 101        if (!subport_cnt) {
 102                shared = 0;
 103                master = 0;
 104                subport_cnt = 1;
 105        } else {
 106                shared = 1;
 107                master = !subport_fp(fp);
 108        }
 109
 110        sz = sizeof(*kinfo);
 111        /* If port sharing is not requested, allow the old size structure */
 112        if (!shared)
 113                sz -= 7 * sizeof(u64);
 114        if (ubase_size < sz) {
 115                ipath_cdbg(PROC,
 116                           "Base size %zu, need %zu (version mismatch?)\n",
 117                           ubase_size, sz);
 118                ret = -EINVAL;
 119                goto bail;
 120        }
 121
 122        kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
 123        if (kinfo == NULL) {
 124                ret = -ENOMEM;
 125                goto bail;
 126        }
 127
 128        ret = dd->ipath_f_get_base_info(pd, kinfo);
 129        if (ret < 0)
 130                goto bail;
 131
 132        kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
 133        kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
 134        kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
 135        kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
 136        /*
 137         * have to mmap whole thing
 138         */
 139        kinfo->spi_rcv_egrbuftotlen =
 140                pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
 141        kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
 142        kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
 143                pd->port_rcvegrbuf_chunks;
 144        kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
 145        if (master)
 146                kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
 147        /*
 148         * for this use, this may be ipath_cfgports summed over all chips
 149         * that are configured and present
 150         */
 151        kinfo->spi_nports = dd->ipath_cfgports;
 152        /* unit (chip/board) our port is on */
 153        kinfo->spi_unit = dd->ipath_unit;
 154        /* for now, only a single page */
 155        kinfo->spi_tid_maxsize = PAGE_SIZE;
 156
 157        /*
 158         * Doing this per port, and based on the skip value, etc.  This has
 159         * to be the actual buffer size, since the protocol code treats it
 160         * as an array.
 161         *
 162         * These have to be set to user addresses in the user code via mmap.
 163         * These values are used on return to user code for the mmap target
 164         * addresses only.  For 32 bit, same 44 bit address problem, so use
 165         * the physical address, not virtual.  Before 2.6.11, using the
 166         * page_address() macro worked, but in 2.6.11, even that returns the
 167         * full 64 bit address (upper bits all 1's).  So far, using the
 168         * physical addresses (or chip offsets, for chip mapping) works, but
 169         * no doubt some future kernel release will change that, and we'll be
 170         * on to yet another method of dealing with this.
 171         */
 172        kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
 173        kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
 174        kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
 175        kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
 176        kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
 177                (void *) dd->ipath_statusp -
 178                (void *) dd->ipath_pioavailregs_dma;
 179        if (!shared) {
 180                kinfo->spi_piocnt = pd->port_piocnt;
 181                kinfo->spi_piobufbase = (u64) pd->port_piobufs;
 182                kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
 183                        dd->ipath_ureg_align * pd->port_port;
 184        } else if (master) {
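                    /* master takes the per-subport share plus any remainder */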
 185                kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
 186                                    (pd->port_piocnt % subport_cnt);
 187                /* Master's PIO buffers are after all the slaves' */
 188                kinfo->spi_piobufbase = (u64) pd->port_piobufs +
 189                        dd->ipath_palign *
 190                        (pd->port_piocnt - kinfo->spi_piocnt);
 191        } else {
 192                unsigned slave = subport_fp(fp) - 1;
 193
 194                kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
 195                kinfo->spi_piobufbase = (u64) pd->port_piobufs +
 196                        dd->ipath_palign * kinfo->spi_piocnt * slave;
 197        }
 198
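            /*
             * Shared (subport) port: redirect this process's spi_* addresses
             * at its slice of the vmalloc'ed subport shared memory (converted
             * with cvt_kvaddr so mmap can tell them apart from chip and DMA
             * addresses).  The spi_port_* fields keep the underlying port's
             * real addresses, and the spi_subport_* fields point at the full
             * shared arrays.
             */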
 199        if (shared) {
 200                kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
 201                        dd->ipath_ureg_align * pd->port_port;
 202                kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
 203                kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
 204                kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
 205
 206                kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
 207                        PAGE_SIZE * subport_fp(fp));
 208
 209                kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
 210                        pd->port_rcvhdrq_size * subport_fp(fp));
 211                kinfo->spi_rcvhdr_tailaddr = 0;
 212                kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
 213                        pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
 214                        subport_fp(fp));
 215
 216                kinfo->spi_subport_uregbase =
 217                        cvt_kvaddr(pd->subport_uregbase);
 218                kinfo->spi_subport_rcvegrbuf =
 219                        cvt_kvaddr(pd->subport_rcvegrbuf);
 220                kinfo->spi_subport_rcvhdr_base =
 221                        cvt_kvaddr(pd->subport_rcvhdr_base);
 222                ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
 223                        kinfo->spi_port, kinfo->spi_runtime_flags,
 224                        (unsigned long long) kinfo->spi_subport_uregbase,
 225                        (unsigned long long) kinfo->spi_subport_rcvegrbuf,
 226                        (unsigned long long) kinfo->spi_subport_rcvhdr_base);
 227        }
 228
 229        /*
 230         * All user buffers are 2KB buffers.  If we ever support
 231         * giving 4KB buffers to user processes, this will need some
 232         * work.
 233         */
 234        kinfo->spi_pioindex = (kinfo->spi_piobufbase -
 235                (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
 236        kinfo->spi_pioalign = dd->ipath_palign;
 237
 238        kinfo->spi_qpair = IPATH_KD_QP;
 239        /*
 240         * user mode PIO buffers are always 2KB, even when 4KB can
 241         * be received, and sent via the kernel; this is ibmaxlen
 242         * for 2K MTU.
 243         */
 244        kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
 245        kinfo->spi_mtu = dd->ipath_ibmaxlen;    /* maxlen, not ibmtu */
 246        kinfo->spi_port = pd->port_port;
 247        kinfo->spi_subport = subport_fp(fp);
 248        kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
 249        kinfo->spi_hw_version = dd->ipath_revision;
 250
 251        if (master) {
 252                kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
 253        }
 254
 255        sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
 256        if (copy_to_user(ubase, kinfo, sz))
 257                ret = -EFAULT;
 258
 259bail:
 260        kfree(kinfo);
 261        return ret;
 262}
 263
 264/**
 265 * ipath_tid_update - update a port TID
 266 * @pd: the port
 267 * @fp: the ipath device file
 268 * @ti: the TID information
 269 *
 270 * The new implementation as of Oct 2004 is that the driver assigns
 271 * the tid and returns it to the caller.   To make it easier to
 272 * catch bugs, and to reduce search time, we keep a cursor for
 273 * each port, walking the shadow tid array to find one that's not
 274 * in use.
 275 *
 276 * For now, if we can't allocate the full list, we fail, although
 277 * in the long run, we'll allocate as many as we can, and the
 278 * caller will deal with that by trying the remaining pages later.
 279 * That means that when we fail, we have to mark the tids as not in
 280 * use again, in our shadow copy.
 281 *
 282 * It's up to the caller to free the tids when they are done.
 283 * We'll unlock the pages as they free them.
 284 *
 285 * Also, right now we are locking one page at a time, but since
 286 * the intended use of this routine is for a single group of
 287 * virtually contiguous pages, that should change to improve
 288 * performance.
 289 */
 290static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
 291                            const struct ipath_tid_info *ti)
 292{
 293        int ret = 0, ntids;
 294        u32 tid, porttid, cnt, i, tidcnt, tidoff;
 295        u16 *tidlist;
 296        struct ipath_devdata *dd = pd->port_dd;
 297        u64 physaddr;
 298        unsigned long vaddr;
 299        u64 __iomem *tidbase;
 300        unsigned long tidmap[8];
 301        struct page **pagep = NULL;
 302        unsigned subport = subport_fp(fp);
 303
 304        if (!dd->ipath_pageshadow) {
 305                ret = -ENOMEM;
 306                goto done;
 307        }
 308
 309        cnt = ti->tidcnt;
 310        if (!cnt) {
 311                ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
 312                          (unsigned long long) ti->tidlist);
 313                /*
 314                 * Should we treat this as success?  Likely a bug.
 315                 */
 316                ret = -EFAULT;
 317                goto done;
 318        }
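            /*
             * porttid indexes this port's region of the device-wide TID
             * shadow arrays.  With subports, the master takes the per-subport
             * share of the port's TIDs plus any remainder (at the end of the
             * port's range) and each slave takes an equal contiguous share;
             * the allocation cursor is kept per-port or per-subport to match.
             */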
 319        porttid = pd->port_port * dd->ipath_rcvtidcnt;
 320        if (!pd->port_subport_cnt) {
 321                tidcnt = dd->ipath_rcvtidcnt;
 322                tid = pd->port_tidcursor;
 323                tidoff = 0;
 324        } else if (!subport) {
 325                tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
 326                         (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
 327                tidoff = dd->ipath_rcvtidcnt - tidcnt;
 328                porttid += tidoff;
 329                tid = tidcursor_fp(fp);
 330        } else {
 331                tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
 332                tidoff = tidcnt * (subport - 1);
 333                porttid += tidoff;
 334                tid = tidcursor_fp(fp);
 335        }
 336        if (cnt > tidcnt) {
 337                /* make sure it all fits in port_tid_pg_list */
 338                dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
 339                         "TIDs, only trying max (%u)\n", cnt, tidcnt);
 340                cnt = tidcnt;
 341        }
 342        pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
 343        tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
 344
 345        memset(tidmap, 0, sizeof(tidmap));
 346        /* actual # of chip TIDs, counted down by the search loop below */
 347        ntids = tidcnt;
 348        tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
 349                                   dd->ipath_rcvtidbase +
 350                                   porttid * sizeof(*tidbase));
 351
 352        ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
 353                   pd->port_port, cnt, tid, tidbase);
 354
 355        /* virtual address of first page in transfer */
 356        vaddr = ti->tidvaddr;
 357        if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
 358                       cnt * PAGE_SIZE)) {
 359                ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
 360                          (void *)vaddr, cnt);
 361                ret = -EFAULT;
 362                goto done;
 363        }
 364        ret = ipath_get_user_pages(vaddr, cnt, pagep);
 365        if (ret) {
 366                if (ret == -EBUSY) {
 367                        ipath_dbg("Failed to lock addr %p, %u pages "
 368                                  "(already locked)\n",
 369                                  (void *) vaddr, cnt);
 370                        /*
 371                         * for now, continue and see what happens, but with
 372                         * the new implementation, this should never happen,
 373                         * unless perhaps the user has mpin'ed the pages
 374                         * themselves (something we need to test)
 375                         */
 376                        ret = 0;
 377                } else {
 378                        dev_info(&dd->pcidev->dev,
 379                                 "Failed to lock addr %p, %u pages: "
 380                                 "errno %d\n", (void *) vaddr, cnt, -ret);
 381                        goto done;
 382                }
 383        }
 384        for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
 385                for (; ntids--; tid++) {
 386                        if (tid == tidcnt)
 387                                tid = 0;
 388                        if (!dd->ipath_pageshadow[porttid + tid])
 389                                break;
 390                }
 391                if (ntids < 0) {
 392                        /*
 393                         * oops, wrapped all the way through their TIDs,
 394                         * and didn't have enough free; see comments at
 395                         * start of routine
 396                         */
 397                        ipath_dbg("Not enough free TIDs for %u pages "
 398                                  "(index %d), failing\n", cnt, i);
 399                        i--;    /* last tidlist[i] not filled in */
 400                        ret = -ENOMEM;
 401                        break;
 402                }
 403                tidlist[i] = tid + tidoff;
 404                ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
 405                           "vaddr %lx\n", i, tid + tidoff, vaddr);
 406                /* we "know" system pages and TID pages are the same size */
 407                dd->ipath_pageshadow[porttid + tid] = pagep[i];
 408                dd->ipath_physshadow[porttid + tid] = ipath_map_page(
 409                        dd->pcidev, pagep[i], 0, PAGE_SIZE,
 410                        PCI_DMA_FROMDEVICE);
 411                /*
 412                 * don't need an atomic op or its overhead
 413                 */
 414                __set_bit(tid, tidmap);
 415                physaddr = dd->ipath_physshadow[porttid + tid];
 416                ipath_stats.sps_pagelocks++;
 417                ipath_cdbg(VERBOSE,
 418                           "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
 419                           tid, vaddr, (unsigned long long) physaddr,
 420                           pagep[i]);
 421                dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
 422                                    physaddr);
 423                /*
 424                 * don't check this tid in ipath_pageshadow, since we
 425                 * just filled it in; start with the next one.
 426                 */
 427                tid++;
 428        }
 429
 430        if (ret) {
 431                u32 limit;
 432        cleanup:
 433                /* jump here if copy out of updated info failed... */
 434                ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
 435                          -ret, i, cnt);
 436                /* same code that's in ipath_tid_free() */
 437                limit = sizeof(tidmap) * BITS_PER_BYTE;
 438                if (limit > tidcnt)
 439                        /* just in case size changes in future */
 440                        limit = tidcnt;
 441                tid = find_first_bit((const unsigned long *)tidmap, limit);
 442                for (; tid < limit; tid++) {
 443                        if (!test_bit(tid, tidmap))
 444                                continue;
 445                        if (dd->ipath_pageshadow[porttid + tid]) {
 446                                ipath_cdbg(VERBOSE, "Freeing TID %u\n",
 447                                           tid);
 448                                dd->ipath_f_put_tid(dd, &tidbase[tid],
 449                                                    RCVHQ_RCV_TYPE_EXPECTED,
 450                                                    dd->ipath_tidinvalid);
 451                                pci_unmap_page(dd->pcidev,
 452                                        dd->ipath_physshadow[porttid + tid],
 453                                        PAGE_SIZE, PCI_DMA_FROMDEVICE);
 454                                dd->ipath_pageshadow[porttid + tid] = NULL;
 455                                ipath_stats.sps_pageunlocks++;
 456                        }
 457                }
 458                ipath_release_user_pages(pagep, cnt);
 459        } else {
 460                /*
 461                 * Copy the updated array, with the TIDs filled in, back
 462                 * to user.  Since we did the copy in already, this "should
 463                 * never fail".  If it does, we have to clean up...
 464                 */
 465                if (copy_to_user((void __user *)
 466                                 (unsigned long) ti->tidlist,
 467                                 tidlist, cnt * sizeof(*tidlist))) {
 468                        ret = -EFAULT;
 469                        goto cleanup;
 470                }
 471                if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
 472                                 tidmap, sizeof tidmap)) {
 473                        ret = -EFAULT;
 474                        goto cleanup;
 475                }
 476                if (tid == tidcnt)
 477                        tid = 0;
 478                if (!pd->port_subport_cnt)
 479                        pd->port_tidcursor = tid;
 480                else
 481                        tidcursor_fp(fp) = tid;
 482        }
 483
 484done:
 485        if (ret)
 486                ipath_dbg("Failed to map %u TID pages, failing with %d\n",
 487                          ti->tidcnt, -ret);
 488        return ret;
 489}
 490
 491/**
 492 * ipath_tid_free - free a port TID
 493 * @pd: the port
 494 * @subport: the subport
 495 * @ti: the TID info
 496 *
 497 * Right now we are unlocking one page at a time, but since
 498 * the intended use of this routine is for a single group of
 499 * virtually contiguous pages, that should change to improve
 500 * performance.  We check that the TID is in range for this port
 501 * but otherwise don't check validity; if the user has an error and
 502 * frees the wrong tid, it's only their own data that can thereby
 503 * be corrupted.  We do check that the TID was in use, for sanity.
 504 * We always use our idea of the saved address, not the address that
 505 * they pass in to us.
 506 */
 507
 508static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
 509                          const struct ipath_tid_info *ti)
 510{
 511        int ret = 0;
 512        u32 tid, porttid, cnt, limit, tidcnt;
 513        struct ipath_devdata *dd = pd->port_dd;
 514        u64 __iomem *tidbase;
 515        unsigned long tidmap[8];
 516
 517        if (!dd->ipath_pageshadow) {
 518                ret = -ENOMEM;
 519                goto done;
 520        }
 521
 522        if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
 523                           sizeof tidmap)) {
 524                ret = -EFAULT;
 525                goto done;
 526        }
 527
 528        porttid = pd->port_port * dd->ipath_rcvtidcnt;
 529        if (!pd->port_subport_cnt)
 530                tidcnt = dd->ipath_rcvtidcnt;
 531        else if (!subport) {
 532                tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
 533                         (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
 534                porttid += dd->ipath_rcvtidcnt - tidcnt;
 535        } else {
 536                tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
 537                porttid += tidcnt * (subport - 1);
 538        }
 539        tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
 540                                   dd->ipath_rcvtidbase +
 541                                   porttid * sizeof(*tidbase));
 542
 543        limit = sizeof(tidmap) * BITS_PER_BYTE;
 544        if (limit > tidcnt)
 545                /* just in case size changes in future */
 546                limit = tidcnt;
 547        tid = find_first_bit(tidmap, limit);
 548        ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
 549                   "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
 550                   limit, tid, porttid);
 551        for (cnt = 0; tid < limit; tid++) {
 552                /*
 553                 * small optimization; if we detect a run of 3 or so without
 554                 * any set, use find_first_bit again.  That's mainly to
 555                 * accelerate the case where we wrapped, so we have some at
 556                 * the beginning, and some at the end, and a big gap
 557                 * in the middle.
 558                 */
 559                if (!test_bit(tid, tidmap))
 560                        continue;
 561                cnt++;
 562                if (dd->ipath_pageshadow[porttid + tid]) {
 563                        struct page *p;
 564                        p = dd->ipath_pageshadow[porttid + tid];
 565                        dd->ipath_pageshadow[porttid + tid] = NULL;
 566                        ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
 567                                   pid_nr(pd->port_pid), tid);
 568                        dd->ipath_f_put_tid(dd, &tidbase[tid],
 569                                            RCVHQ_RCV_TYPE_EXPECTED,
 570                                            dd->ipath_tidinvalid);
 571                        pci_unmap_page(dd->pcidev,
 572                                dd->ipath_physshadow[porttid + tid],
 573                                PAGE_SIZE, PCI_DMA_FROMDEVICE);
 574                        ipath_release_user_pages(&p, 1);
 575                        ipath_stats.sps_pageunlocks++;
 576                } else
 577                        ipath_dbg("Unused tid %u, ignoring\n", tid);
 578        }
 579        if (cnt != ti->tidcnt)
 580                ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
 581                          ti->tidcnt, cnt);
 582done:
 583        if (ret)
 584                ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
 585                          ti->tidcnt, -ret);
 586        return ret;
 587}
 588
 589/**
 590 * ipath_set_part_key - set a partition key
 591 * @pd: the port
 592 * @key: the key
 593 *
 594 * We can have up to 4 active at a time (other than the default, which is
 595 * always allowed).  This is somewhat tricky, since multiple ports may set
 596 * the same key, so we reference count them, and clean up at exit.  All 4
 597 * partition keys are packed into a single infinipath register.  It's an
 598 * error for a process to set the same pkey multiple times.  We provide no
 599 * mechanism to de-allocate a pkey at this time; we may eventually need to
 600 * do that.  I've used the atomic operations, and no locking, and only make
 601 * a single pass through what's available.  This should be more than
 602 * adequate for some time. I'll think about spinlocks or the like if and as
 603 * it's necessary.
 604 */
 605static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
 606{
 607        struct ipath_devdata *dd = pd->port_dd;
 608        int i, any = 0, pidx = -1;
 609        u16 lkey = key & 0x7FFF;
 610        int ret;
 611
 612        if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
 613                /* nothing to do; this key is always valid */
 614                ret = 0;
 615                goto bail;
 616        }
 617
 618        ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
 619                   "%hx:%x %hx:%x %hx:%x %hx:%x\n",
 620                   pd->port_port, key, dd->ipath_pkeys[0],
 621                   atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
 622                   atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
 623                   atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
 624                   atomic_read(&dd->ipath_pkeyrefs[3]));
 625
 626        if (!lkey) {
 627                ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
 628                           pd->port_port);
 629                ret = -EINVAL;
 630                goto bail;
 631        }
 632
 633        /*
 634         * Set the full membership bit, because it has to be
 635         * set in the register or the packet, and it seems
 636         * cleaner to set in the register than to force all
 637         * callers to set it. (see bug 4331)
 638         */
 639        key |= 0x8000;
 640
 641        for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
 642                if (!pd->port_pkeys[i] && pidx == -1)
 643                        pidx = i;
 644                if (pd->port_pkeys[i] == key) {
 645                        ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
 646                                   "(%x) more than once\n",
 647                                   pd->port_port, key);
 648                        ret = -EEXIST;
 649                        goto bail;
 650                }
 651        }
 652        if (pidx == -1) {
 653                ipath_dbg("All pkeys for port %u already in use, "
 654                          "can't set %x\n", pd->port_port, key);
 655                ret = -EBUSY;
 656                goto bail;
 657        }
 658        for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
 659                if (!dd->ipath_pkeys[i]) {
 660                        any++;
 661                        continue;
 662                }
 663                if (dd->ipath_pkeys[i] == key) {
 664                        atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
 665
 666                        if (atomic_inc_return(pkrefs) > 1) {
 667                                pd->port_pkeys[pidx] = key;
 668                                ipath_cdbg(VERBOSE, "p%u set key %x "
 669                                           "matches #%d, count now %d\n",
 670                                           pd->port_port, key, i,
 671                                           atomic_read(pkrefs));
 672                                ret = 0;
 673                                goto bail;
 674                        } else {
 675                                /*
 676                                 * lost race, decrement count, catch below
 677                                 */
 678                                atomic_dec(pkrefs);
 679                                ipath_cdbg(VERBOSE, "Lost race, count was "
 680                                           "0, after dec, it's %d\n",
 681                                           atomic_read(pkrefs));
 682                                any++;
 683                        }
 684                }
 685                if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
 686                        /*
 687                         * It makes no sense to have both the limited and
 688                         * full membership PKEY set at the same time since
 689                         * the unlimited one will disable the limited one.
 690                         */
 691                        ret = -EEXIST;
 692                        goto bail;
 693                }
 694        }
 695        if (!any) {
 696                ipath_dbg("port %u, all pkeys already in use, "
 697                          "can't set %x\n", pd->port_port, key);
 698                ret = -EBUSY;
 699                goto bail;
 700        }
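            /*
             * No existing entry matched; claim the first free device pkey
             * slot (the atomic_inc_return guards against racing with another
             * port) and rewrite the packed partition key register with all
             * four keys.
             */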
 701        for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
 702                if (!dd->ipath_pkeys[i] &&
 703                    atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
 704                        u64 pkey;
 705
 706                        /* for ipathstats, etc. */
 707                        ipath_stats.sps_pkeys[i] = lkey;
 708                        pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
 709                        pkey =
 710                                (u64) dd->ipath_pkeys[0] |
 711                                ((u64) dd->ipath_pkeys[1] << 16) |
 712                                ((u64) dd->ipath_pkeys[2] << 32) |
 713                                ((u64) dd->ipath_pkeys[3] << 48);
 714                        ipath_cdbg(PROC, "p%u set key %x in #%d, "
 715                                   "portidx %d, new pkey reg %llx\n",
 716                                   pd->port_port, key, i, pidx,
 717                                   (unsigned long long) pkey);
 718                        ipath_write_kreg(
 719                                dd, dd->ipath_kregs->kr_partitionkey, pkey);
 720
 721                        ret = 0;
 722                        goto bail;
 723                }
 724        }
 725        ipath_dbg("port %u, all pkeys already in use 2nd pass, "
 726                  "can't set %x\n", pd->port_port, key);
 727        ret = -EBUSY;
 728
 729bail:
 730        return ret;
 731}
 732
 733/**
 734 * ipath_manage_rcvq - manage a port's receive queue
 735 * @pd: the port
 736 * @subport: the subport
 737 * @start_stop: action to carry out
 738 *
 739 * start_stop == 0 disables receive on the port, for use in queue
 740 * overflow conditions.  start_stop == 1 re-enables, to be used to
 741 * re-init the software copy of the head register.
 742 */
 743static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
 744                             int start_stop)
 745{
 746        struct ipath_devdata *dd = pd->port_dd;
 747
 748        ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
 749                   start_stop ? "en" : "dis", dd->ipath_unit,
 750                   pd->port_port, subport);
 751        if (subport)
 752                goto bail;
 753        /* atomically set or clear the receive enable bit for this port */
 754        if (start_stop) {
 755                /*
 756                 * On enable, force in-memory copy of the tail register to
 757                 * 0, so that protocol code doesn't have to worry about
 758                 * whether or not the chip has yet updated the in-memory
 759                 * copy or not on return from the system call. The chip
 760                 * always resets its tail register back to 0 on a
 761                 * transition from disabled to enabled.  This could cause a
 762                 * problem if software was broken, and did the enable w/o
 763                 * the disable, but eventually the in-memory copy will be
 764                 * updated and correct itself, even in the face of software
 765                 * bugs.
 766                 */
 767                if (pd->port_rcvhdrtail_kvaddr)
 768                        ipath_clear_rcvhdrtail(pd);
 769                set_bit(dd->ipath_r_portenable_shift + pd->port_port,
 770                        &dd->ipath_rcvctrl);
 771        } else
 772                clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
 773                          &dd->ipath_rcvctrl);
 774        ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
 775                         dd->ipath_rcvctrl);
 776        /* now be sure chip saw it before we return */
 777        ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
 778        if (start_stop) {
 779                /*
 780                 * And try to be sure that tail reg update has happened too.
 781                 * This should in theory interlock with the RXE changes to
 782                 * the tail register.  Don't assign it to the tail register
 783                 * in-memory copy, since we could overwrite an update by the
 784                 * chip if we did.
 785                 */
 786                ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
 787        }
 788        /* always return 0; new head should be equal to new tail; see above */
 789bail:
 790        return 0;
 791}
 792
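    /*
     * Drop this port's references on the device partition keys it had set;
     * any key whose reference count reaches zero is cleared, and the packed
     * partition key register is rewritten if anything changed.
     */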
 793static void ipath_clean_part_key(struct ipath_portdata *pd,
 794                                 struct ipath_devdata *dd)
 795{
 796        int i, j, pchanged = 0;
 797        u64 oldpkey;
 798
 799        /* for debugging only */
 800        oldpkey = (u64) dd->ipath_pkeys[0] |
 801                ((u64) dd->ipath_pkeys[1] << 16) |
 802                ((u64) dd->ipath_pkeys[2] << 32) |
 803                ((u64) dd->ipath_pkeys[3] << 48);
 804
 805        for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
 806                if (!pd->port_pkeys[i])
 807                        continue;
 808                ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
 809                           pd->port_pkeys[i]);
 810                for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
 811                        /* check for match independent of the full membership bit */
 812                        if ((dd->ipath_pkeys[j] & 0x7fff) !=
 813                            (pd->port_pkeys[i] & 0x7fff))
 814                                continue;
 815                        if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
 816                                ipath_cdbg(VERBOSE, "p%u clear key "
 817                                           "%x matches #%d\n",
 818                                           pd->port_port,
 819                                           pd->port_pkeys[i], j);
 820                                ipath_stats.sps_pkeys[j] =
 821                                        dd->ipath_pkeys[j] = 0;
 822                                pchanged++;
 823                        } else
 824                                ipath_cdbg(VERBOSE,
 825                                           "p%u key %x matches #%d, "
 826                                           "but ref still %d\n", pd->port_port,
 827                                           pd->port_pkeys[i], j,
 828                                           atomic_read(&dd->ipath_pkeyrefs[j]));
 829                        break;
 830                }
 831                pd->port_pkeys[i] = 0;
 832        }
 833        if (pchanged) {
 834                u64 pkey = (u64) dd->ipath_pkeys[0] |
 835                        ((u64) dd->ipath_pkeys[1] << 16) |
 836                        ((u64) dd->ipath_pkeys[2] << 32) |
 837                        ((u64) dd->ipath_pkeys[3] << 48);
 838                ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
 839                           "new pkey reg %llx\n", pd->port_port,
 840                           (unsigned long long) oldpkey,
 841                           (unsigned long long) pkey);
 842                ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
 843                                 pkey);
 844        }
 845}
 846
 847/*
 848 * Initialize the port data with the receive buffer sizes
 849 * so this can be done while the master port is locked.
 850 * Otherwise, there is a race with a slave opening the port
 851 * and seeing these fields uninitialized.
 852 */
 853static void init_user_egr_sizes(struct ipath_portdata *pd)
 854{
 855        struct ipath_devdata *dd = pd->port_dd;
 856        unsigned egrperchunk, egrcnt, size;
 857
 858        /*
 859         * to avoid wasting a lot of memory, we allocate 32KB chunks of
 860         * physically contiguous memory, advance through it until used up
 861         * and then allocate more.  Of course, we need memory to store those
 862         * extra pointers, now.  Started out with 256KB, but under heavy
 863         * memory pressure (creating large files and then copying them over
 864         * NFS while doing lots of MPI jobs), we hit some allocation
 865         * failures, even though we can sleep...  (2.6.10) Still get
 866         * failures at 64K.  32K is the lowest we can go without wasting
 867         * additional memory.
 868         */
 869        size = 0x8000;
 870        egrperchunk = size / dd->ipath_rcvegrbufsize;
 871        egrcnt = dd->ipath_rcvegrcnt;
 872        pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
 873        pd->port_rcvegrbufs_perchunk = egrperchunk;
 874        pd->port_rcvegrbuf_size = size;
 875}
 876
 877/**
 878 * ipath_create_user_egr - allocate eager TID buffers
 879 * @pd: the port to allocate TID buffers for
 880 *
 881 * This routine is now quite different for user and kernel, because
 882 * the kernel uses skb's for the accelerated network performance.
 883 * This is the user port version.
 884 *
 885 * Allocate the eager TID buffers and program them into infinipath.
 886 * They are no longer completely contiguous; we do multiple allocation
 887 * calls.
 888 */
 889static int ipath_create_user_egr(struct ipath_portdata *pd)
 890{
 891        struct ipath_devdata *dd = pd->port_dd;
 892        unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
 893        size_t size;
 894        int ret;
 895        gfp_t gfp_flags;
 896
 897        /*
 898         * GFP_USER, but without GFP_FS, so buffer cache can be
 899         * coalesced (we hope); otherwise, even at order 4,
 900         * heavy filesystem activity makes these fail, and we can
 901         * use compound pages.
 902         */
 903        gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
 904
 905        egrcnt = dd->ipath_rcvegrcnt;
 906        /* TID number offset for this port */
 907        egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
 908        egrsize = dd->ipath_rcvegrbufsize;
 909        ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
 910                   "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
 911
 912        chunk = pd->port_rcvegrbuf_chunks;
 913        egrperchunk = pd->port_rcvegrbufs_perchunk;
 914        size = pd->port_rcvegrbuf_size;
 915        pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
 916                                     GFP_KERNEL);
 917        if (!pd->port_rcvegrbuf) {
 918                ret = -ENOMEM;
 919                goto bail;
 920        }
 921        pd->port_rcvegrbuf_phys =
 922                kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
 923                        GFP_KERNEL);
 924        if (!pd->port_rcvegrbuf_phys) {
 925                ret = -ENOMEM;
 926                goto bail_rcvegrbuf;
 927        }
 928        for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
 929
 930                pd->port_rcvegrbuf[e] = dma_alloc_coherent(
 931                        &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
 932                        gfp_flags);
 933
 934                if (!pd->port_rcvegrbuf[e]) {
 935                        ret = -ENOMEM;
 936                        goto bail_rcvegrbuf_phys;
 937                }
 938        }
 939
 940        pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
 941
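            /*
             * Program the DMA address of each eager buffer into the chip's
             * eager TID array for this port, one egrsize slice per TID.
             */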
 942        for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
 943                dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
 944                unsigned i;
 945
 946                for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
 947                        dd->ipath_f_put_tid(dd, e + egroff +
 948                                            (u64 __iomem *)
 949                                            ((char __iomem *)
 950                                             dd->ipath_kregbase +
 951                                             dd->ipath_rcvegrbase),
 952                                            RCVHQ_RCV_TYPE_EAGER, pa);
 953                        pa += egrsize;
 954                }
 955                cond_resched(); /* don't hog the cpu */
 956        }
 957
 958        ret = 0;
 959        goto bail;
 960
 961bail_rcvegrbuf_phys:
 962        for (e = 0; e < pd->port_rcvegrbuf_chunks &&
 963                pd->port_rcvegrbuf[e]; e++) {
 964                dma_free_coherent(&dd->pcidev->dev, size,
 965                                  pd->port_rcvegrbuf[e],
 966                                  pd->port_rcvegrbuf_phys[e]);
 967
 968        }
 969        kfree(pd->port_rcvegrbuf_phys);
 970        pd->port_rcvegrbuf_phys = NULL;
 971bail_rcvegrbuf:
 972        kfree(pd->port_rcvegrbuf);
 973        pd->port_rcvegrbuf = NULL;
 974bail:
 975        return ret;
 976}
 977
 978
 979/* common code for the mappings on dma_alloc_coherent mem */
 980static int ipath_mmap_mem(struct vm_area_struct *vma,
 981        struct ipath_portdata *pd, unsigned len, int write_ok,
 982        void *kvaddr, char *what)
 983{
 984        struct ipath_devdata *dd = pd->port_dd;
 985        unsigned long pfn;
 986        int ret;
 987
 988        if ((vma->vm_end - vma->vm_start) > len) {
 989                dev_info(&dd->pcidev->dev,
 990                         "FAIL on %s: len %lx > %x\n", what,
 991                         vma->vm_end - vma->vm_start, len);
 992                ret = -EFAULT;
 993                goto bail;
 994        }
 995
 996        if (!write_ok) {
 997                if (vma->vm_flags & VM_WRITE) {
 998                        dev_info(&dd->pcidev->dev,
 999                                 "%s must be mapped readonly\n", what);
1000                        ret = -EPERM;
1001                        goto bail;
1002                }
1003
1004                /* don't allow them to later change with mprotect */
1005                vma->vm_flags &= ~VM_MAYWRITE;
1006        }
1007
1008        pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
1009        ret = remap_pfn_range(vma, vma->vm_start, pfn,
1010                              len, vma->vm_page_prot);
1011        if (ret)
1012                dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
1013                         "bytes r%c failed: %d\n", what, pd->port_port,
1014                         pfn, len, write_ok?'w':'o', ret);
1015        else
1016                ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
1017                           "r%c\n", what, pd->port_port, pfn, len,
1018                           write_ok?'w':'o');
1019bail:
1020        return ret;
1021}
1022
1023static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
1024                     u64 ureg)
1025{
1026        unsigned long phys;
1027        int ret;
1028
1029        /*
1030         * This is real hardware, so use io_remap.  This is the mechanism
1031         * for the user process to update the head registers for their port
1032         * in the chip.
1033         */
1034        if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
1035                dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
1036                         "%lx > PAGE\n", vma->vm_end - vma->vm_start);
1037                ret = -EFAULT;
1038        } else {
1039                phys = dd->ipath_physaddr + ureg;
1040                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1041
1042                vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
1043                ret = io_remap_pfn_range(vma, vma->vm_start,
1044                                         phys >> PAGE_SHIFT,
1045                                         vma->vm_end - vma->vm_start,
1046                                         vma->vm_page_prot);
1047        }
1048        return ret;
1049}
1050
1051static int mmap_piobufs(struct vm_area_struct *vma,
1052                        struct ipath_devdata *dd,
1053                        struct ipath_portdata *pd,
1054                        unsigned piobufs, unsigned piocnt)
1055{
1056        unsigned long phys;
1057        int ret;
1058
1059        /*
1060         * When we map the PIO buffers in the chip, we want to map them as
1061         * writeonly, no read possible.   This prevents access to previous
1062         * process data, and catches users who might try to read the i/o
1063         * space due to a bug.
1064         */
1065        if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
1066                dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
1067                         "reqlen %lx > PAGE\n",
1068                         vma->vm_end - vma->vm_start);
1069                ret = -EINVAL;
1070                goto bail;
1071        }
1072
1073        phys = dd->ipath_physaddr + piobufs;
1074
1075#if defined(__powerpc__)
1076        /* There isn't a generic way to specify writethrough mappings */
1077        pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
1078        pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
1079        pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
1080#endif
1081
1082        /*
1083         * don't allow them to later change to readable with mprotect (for when
1084         * not initially mapped readable, as is normally the case)
1085         */
1086        vma->vm_flags &= ~VM_MAYREAD;
1087        vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
1088
1089        ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
1090                                 vma->vm_end - vma->vm_start,
1091                                 vma->vm_page_prot);
1092bail:
1093        return ret;
1094}
1095
1096static int mmap_rcvegrbufs(struct vm_area_struct *vma,
1097                           struct ipath_portdata *pd)
1098{
1099        struct ipath_devdata *dd = pd->port_dd;
1100        unsigned long start, size;
1101        size_t total_size, i;
1102        unsigned long pfn;
1103        int ret;
1104
1105        size = pd->port_rcvegrbuf_size;
1106        total_size = pd->port_rcvegrbuf_chunks * size;
1107        if ((vma->vm_end - vma->vm_start) > total_size) {
1108                dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
1109                         "reqlen %lx > actual %lx\n",
1110                         vma->vm_end - vma->vm_start,
1111                         (unsigned long) total_size);
1112                ret = -EINVAL;
1113                goto bail;
1114        }
1115
1116        if (vma->vm_flags & VM_WRITE) {
1117                dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
1118                         "writable (flags=%lx)\n", vma->vm_flags);
1119                ret = -EPERM;
1120                goto bail;
1121        }
1122        /* don't allow them to later change to writeable with mprotect */
1123        vma->vm_flags &= ~VM_MAYWRITE;
1124
1125        start = vma->vm_start;
1126
1127        for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
1128                pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
1129                ret = remap_pfn_range(vma, start, pfn, size,
1130                                      vma->vm_page_prot);
1131                if (ret < 0)
1132                        goto bail;
1133        }
1134        ret = 0;
1135
1136bail:
1137        return ret;
1138}
1139
1140/*
1141 * ipath_file_vma_fault - handle a VMA page fault.
1142 */
1143static int ipath_file_vma_fault(struct vm_area_struct *vma,
1144                                        struct vm_fault *vmf)
1145{
1146        struct page *page;
1147
1148        page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
1149        if (!page)
1150                return VM_FAULT_SIGBUS;
1151        get_page(page);
1152        vmf->page = page;
1153
1154        return 0;
1155}
1156
1157static const struct vm_operations_struct ipath_file_vm_ops = {
1158        .fault = ipath_file_vma_fault,
1159};
1160
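    /*
     * Map the vmalloc'ed shared-port (subport) memory: the combined and
     * per-subport uregbase, rcvhdrq, and eager buffer regions.  Returns 1
     * if the request matched one of those regions (the fault handler above
     * then supplies the pages), 0 if the port isn't shared or the offset
     * doesn't match (so the caller falls through to the physical mappings),
     * or a negative errno on error.
     */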
1161static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
1162                       struct ipath_portdata *pd, unsigned subport)
1163{
1164        unsigned long len;
1165        struct ipath_devdata *dd;
1166        void *addr;
1167        size_t size;
1168        int ret = 0;
1169
1170        /* If the port is not shared, all addresses should be physical */
1171        if (!pd->port_subport_cnt)
1172                goto bail;
1173
1174        dd = pd->port_dd;
1175        size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
1176
1177        /*
1178         * Each process has all the subport uregbase, rcvhdrq, and
1179         * rcvegrbufs mmapped - as an array for all the processes,
1180         * and also separately for this process.
1181         */
1182        if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
1183                addr = pd->subport_uregbase;
1184                size = PAGE_SIZE * pd->port_subport_cnt;
1185        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
1186                addr = pd->subport_rcvhdr_base;
1187                size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
1188        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
1189                addr = pd->subport_rcvegrbuf;
1190                size *= pd->port_subport_cnt;
1191        } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
1192                                        PAGE_SIZE * subport)) {
1193                addr = pd->subport_uregbase + PAGE_SIZE * subport;
1194                size = PAGE_SIZE;
1195        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
1196                                pd->port_rcvhdrq_size * subport)) {
1197                addr = pd->subport_rcvhdr_base +
1198                        pd->port_rcvhdrq_size * subport;
1199                size = pd->port_rcvhdrq_size;
1200        } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
1201                               size * subport)) {
1202                addr = pd->subport_rcvegrbuf + size * subport;
1203                /* rcvegrbufs are read-only on the slave */
1204                if (vma->vm_flags & VM_WRITE) {
1205                        dev_info(&dd->pcidev->dev,
1206                                 "Can't map eager buffers as "
1207                                 "writable (flags=%lx)\n", vma->vm_flags);
1208                        ret = -EPERM;
1209                        goto bail;
1210                }
1211                /*
1212                 * Don't allow permission to later change to writeable
1213                 * with mprotect.
1214                 */
1215                vma->vm_flags &= ~VM_MAYWRITE;
1216        } else {
1217                goto bail;
1218        }
1219        len = vma->vm_end - vma->vm_start;
1220        if (len > size) {
1221                ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
1222                ret = -EINVAL;
1223                goto bail;
1224        }
1225
1226        vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
1227        vma->vm_ops = &ipath_file_vm_ops;
1228        vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
1229        ret = 1;
1230
1231bail:
1232        return ret;
1233}
1234
1235/**
1236 * ipath_mmap - mmap various structures into user space
1237 * @fp: the file pointer
1238 * @vma: the VM area
1239 *
1240 * We use this to have a shared buffer between the kernel and the user code
1241 * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
1242 * buffers in the chip.  We have the open and close entries so we can bump
1243 * the ref count and keep the driver from being unloaded while still mapped.
1244 */
1245static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
1246{
1247        struct ipath_portdata *pd;
1248        struct ipath_devdata *dd;
1249        u64 pgaddr, ureg;
1250        unsigned piobufs, piocnt;
1251        int ret;
1252
1253        pd = port_fp(fp);
1254        if (!pd) {
1255                ret = -EINVAL;
1256                goto bail;
1257        }
1258        dd = pd->port_dd;
1259
1260        /*
1261         * This is the ipath_do_user_init() code, mapping the shared buffers
1262         * into the user process. The address referred to by vm_pgoff is the
1263         * file offset passed via mmap().  For shared ports, this is the
1264         * kernel vmalloc() address of the pages to share with the master.
1265         * For non-shared or master ports, this is a physical address.
1266         * We only do one mmap for each space mapped.
1267         */
1268        pgaddr = vma->vm_pgoff << PAGE_SHIFT;
1269
1270        /*
1271         * Check for 0 in case one of the allocations failed, but user
1272         * called mmap anyway.
1273         */
1274        if (!pgaddr)  {
1275                ret = -EINVAL;
1276                goto bail;
1277        }
1278
1279        ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
1280                   (unsigned long long) pgaddr, vma->vm_start,
1281                   vma->vm_end - vma->vm_start, dd->ipath_unit,
1282                   pd->port_port, subport_fp(fp));
1283
1284        /*
1285         * Physical addresses must fit in 40 bits for our hardware.
1286         * Check for kernel virtual addresses first; anything else must
1287         * match a HW or memory address.
1288         */
1289        ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
1290        if (ret) {
1291                if (ret > 0)
1292                        ret = 0;
1293                goto bail;
1294        }
1295
1296        ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
1297        if (!pd->port_subport_cnt) {
1298                /* port is not shared */
1299                piocnt = pd->port_piocnt;
1300                piobufs = pd->port_piobufs;
1301        } else if (!subport_fp(fp)) {
1302                /* caller is the master */
1303                piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
1304                         (pd->port_piocnt % pd->port_subport_cnt);
1305                piobufs = pd->port_piobufs +
1306                        dd->ipath_palign * (pd->port_piocnt - piocnt);
1307        } else {
1308                unsigned slave = subport_fp(fp) - 1;
1309
1310                /* caller is a slave */
1311                piocnt = pd->port_piocnt / pd->port_subport_cnt;
1312                piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
1313        }
1314
1315        if (pgaddr == ureg)
1316                ret = mmap_ureg(vma, dd, ureg);
1317        else if (pgaddr == piobufs)
1318                ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
1319        else if (pgaddr == dd->ipath_pioavailregs_phys)
1320                /* in-memory copy of pioavail registers */
1321                ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
1322                                     (void *) dd->ipath_pioavailregs_dma,
1323                                     "pioavail registers");
1324        else if (pgaddr == pd->port_rcvegr_phys)
1325                ret = mmap_rcvegrbufs(vma, pd);
1326        else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
1327                /*
1328                 * The rcvhdrq itself; readonly except on HT (so have
1329                 * to allow writable mapping), multiple pages, contiguous
1330                 * from an i/o perspective.
1331                 */
1332                ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
1333                                     pd->port_rcvhdrq,
1334                                     "rcvhdrq");
1335        else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
1336                /* in-memory copy of rcvhdrq tail register */
1337                ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
1338                                     pd->port_rcvhdrtail_kvaddr,
1339                                     "rcvhdrq tail");
1340        else
1341                ret = -EINVAL;
1342
1343        vma->vm_private_data = NULL;
1344
1345        if (ret < 0)
1346                dev_info(&dd->pcidev->dev,
1347                         "Failure %d on off %llx len %lx\n",
1348                         -ret, (unsigned long long)pgaddr,
1349                         vma->vm_end - vma->vm_start);
1350bail:
1351        return ret;
1352}
1353
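/*
 * Usage sketch (illustrative only, not part of the driver): each region
 * handled above is mapped with a single mmap() whose file offset is the
 * value the driver published for that region (a physical/chip address, or
 * a kernel-virtual-derived cookie for shared ports).  Mapping the
 * in-memory pioavail copy, assuming its offset came from the base-info
 * returned at IPATH_CMD_USER_INIT time (the field name is not shown here):
 *
 *      #include <sys/mman.h>
 *      #include <unistd.h>
 *
 *      // the pioavail shadow is a single page and is mapped read-only
 *      static volatile unsigned long long *map_pioavail(int fd, off_t off)
 *      {
 *              void *p = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ,
 *                             MAP_SHARED, fd, off);
 *
 *              return p == MAP_FAILED ? NULL : p;
 *      }
 */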
1354static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
1355{
1356        unsigned pollflag = 0;
1357
1358        if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
1359            pd->port_hdrqfull != pd->port_hdrqfull_poll) {
1360                pollflag |= POLLIN | POLLRDNORM;
1361                pd->port_hdrqfull_poll = pd->port_hdrqfull;
1362        }
1363
1364        return pollflag;
1365}
1366
1367static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
1368                                      struct file *fp,
1369                                      struct poll_table_struct *pt)
1370{
1371        unsigned pollflag = 0;
1372        struct ipath_devdata *dd;
1373
1374        dd = pd->port_dd;
1375
1376        /* variable access in ipath_poll_hdrqfull() needs this */
1377        rmb();
1378        pollflag = ipath_poll_hdrqfull(pd);
1379
1380        if (pd->port_urgent != pd->port_urgent_poll) {
1381                pollflag |= POLLIN | POLLRDNORM;
1382                pd->port_urgent_poll = pd->port_urgent;
1383        }
1384
1385        if (!pollflag) {
1386                /* this saves a spin_lock/unlock in interrupt handler... */
1387                set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
1388                /* flush waiting flag so don't miss an event... */
1389                wmb();
1390                poll_wait(fp, &pd->port_wait, pt);
1391        }
1392
1393        return pollflag;
1394}
1395
1396static unsigned int ipath_poll_next(struct ipath_portdata *pd,
1397                                    struct file *fp,
1398                                    struct poll_table_struct *pt)
1399{
1400        u32 head;
1401        u32 tail;
1402        unsigned pollflag = 0;
1403        struct ipath_devdata *dd;
1404
1405        dd = pd->port_dd;
1406
1407        /* variable access in ipath_poll_hdrqfull() needs this */
1408        rmb();
1409        pollflag = ipath_poll_hdrqfull(pd);
1410
1411        head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
1412        if (pd->port_rcvhdrtail_kvaddr)
1413                tail = ipath_get_rcvhdrtail(pd);
1414        else
1415                tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
1416
1417        if (head != tail)
1418                pollflag |= POLLIN | POLLRDNORM;
1419        else {
1420                /* this saves a spin_lock/unlock in interrupt handler */
1421                set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
1422                /* flush waiting flag so we don't miss an event */
1423                wmb();
1424
1425                set_bit(pd->port_port + dd->ipath_r_intravail_shift,
1426                        &dd->ipath_rcvctrl);
1427
1428                ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
1429                                 dd->ipath_rcvctrl);
1430
1431                if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
1432                        ipath_write_ureg(dd, ur_rcvhdrhead,
1433                                         dd->ipath_rhdrhead_intr_off | head,
1434                                         pd->port_port);
1435
1436                poll_wait(fp, &pd->port_wait, pt);
1437        }
1438
1439        return pollflag;
1440}
1441
1442static unsigned int ipath_poll(struct file *fp,
1443                               struct poll_table_struct *pt)
1444{
1445        struct ipath_portdata *pd;
1446        unsigned pollflag;
1447
1448        pd = port_fp(fp);
1449        if (!pd)
1450                pollflag = 0;
1451        else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
1452                pollflag = ipath_poll_urgent(pd, fp, pt);
1453        else
1454                pollflag = ipath_poll_next(pd, fp, pt);
1455
1456        return pollflag;
1457}
1458
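/*
 * Usage sketch (illustrative only, not part of the driver): poll() on the
 * port fd reports POLLIN | POLLRDNORM when a packet is pending, or, with
 * IPATH_CMD_POLL_TYPE set to IPATH_POLL_TYPE_URGENT/_OVERFLOW, when an
 * urgent packet or header-queue overflow occurs.  Assuming ipath_common.h
 * provides struct ipath_cmd and the constants used below:
 *
 *      #include <poll.h>
 *      #include <unistd.h>
 *      #include "ipath_common.h"
 *
 *      static int wait_for_rcv(int fd, int timeout_ms)
 *      {
 *              struct ipath_cmd c = { .type = IPATH_CMD_POLL_TYPE };
 *              struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *              c.cmd.poll_type = IPATH_POLL_TYPE_URGENT; // 0 waits for next packet
 *              if (write(fd, &c, sizeof(c)) < 0)
 *                      return -1;
 *              return poll(&pfd, 1, timeout_ms);         // > 0: event pending
 *      }
 */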
1459static int ipath_supports_subports(int user_swmajor, int user_swminor)
1460{
1461        /* no subport implementation prior to software version 1.3 */
1462        return (user_swmajor > 1) || (user_swminor >= 3);
1463}
1464
1465static int ipath_compatible_subports(int user_swmajor, int user_swminor)
1466{
1467        /* this code is written long-hand for clarity */
1468        if (IPATH_USER_SWMAJOR != user_swmajor) {
1469                /* no promise of compatibility if major mismatch */
1470                return 0;
1471        }
1472        if (IPATH_USER_SWMAJOR == 1) {
1473                switch (IPATH_USER_SWMINOR) {
1474                case 0:
1475                case 1:
1476                case 2:
1477                        /* no subport implementation so cannot be compatible */
1478                        return 0;
1479                case 3:
1480                        /* 3 is only compatible with itself */
1481                        return user_swminor == 3;
1482                default:
1483                        /* >= 4 are compatible (or are expected to be) */
1484                        return user_swminor >= 4;
1485                }
1486        }
1487        /* make no promises yet for future major versions */
1488        return 0;
1489}
1490
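/*
 * Note (sketch, not part of the driver): spu_userversion packs the user
 * library's major version in the upper 16 bits and the minor version in
 * the lower 16 bits, which is how the checks above and ipath_assign_port()
 * below unpack it:
 *
 *      #include "ipath_common.h"
 *
 *      // e.g. version 1.3 packs to 0x00010003
 *      static inline unsigned int ipath_pack_userversion(void)
 *      {
 *              return (IPATH_USER_SWMAJOR << 16) | IPATH_USER_SWMINOR;
 *      }
 */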
1491static int init_subports(struct ipath_devdata *dd,
1492                         struct ipath_portdata *pd,
1493                         const struct ipath_user_info *uinfo)
1494{
1495        int ret = 0;
1496        unsigned num_subports;
1497        size_t size;
1498
1499        /*
1500         * If the user is requesting zero subports,
1501         * skip the subport allocation.
1502         */
1503        if (uinfo->spu_subport_cnt <= 0)
1504                goto bail;
1505
1506        /* Self-consistency check for ipath_compatible_subports() */
1507        if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
1508            !ipath_compatible_subports(IPATH_USER_SWMAJOR,
1509                                       IPATH_USER_SWMINOR)) {
1510                dev_info(&dd->pcidev->dev,
1511                         "Inconsistent ipath_compatible_subports()\n");
1512                goto bail;
1513        }
1514
1515        /* Check for subport compatibility */
1516        if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
1517                                       uinfo->spu_userversion & 0xffff)) {
1518                dev_info(&dd->pcidev->dev,
1519                         "Mismatched user version (%d.%d) and driver "
1520                         "version (%d.%d) while port sharing. Ensure "
1521                         "that driver and library are from the same "
1522                         "release.\n",
1523                         (int) (uinfo->spu_userversion >> 16),
1524                         (int) (uinfo->spu_userversion & 0xffff),
1525                         IPATH_USER_SWMAJOR,
1526                         IPATH_USER_SWMINOR);
1527                goto bail;
1528        }
1529        if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
1530                ret = -EINVAL;
1531                goto bail;
1532        }
1533
1534        num_subports = uinfo->spu_subport_cnt;
1535        pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
1536        if (!pd->subport_uregbase) {
1537                ret = -ENOMEM;
1538                goto bail;
1539        }
1540        /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
1541        size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
1542                     sizeof(u32), PAGE_SIZE) * num_subports;
1543        pd->subport_rcvhdr_base = vzalloc(size);
1544        if (!pd->subport_rcvhdr_base) {
1545                ret = -ENOMEM;
1546                goto bail_ureg;
1547        }
1548
1549        pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
1550                                        pd->port_rcvegrbuf_size *
1551                                        num_subports);
1552        if (!pd->subport_rcvegrbuf) {
1553                ret = -ENOMEM;
1554                goto bail_rhdr;
1555        }
1556
1557        pd->port_subport_cnt = uinfo->spu_subport_cnt;
1558        pd->port_subport_id = uinfo->spu_subport_id;
1559        pd->active_slaves = 1;
1560        set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
1561        goto bail;
1562
1563bail_rhdr:
1564        vfree(pd->subport_rcvhdr_base);
1565bail_ureg:
1566        vfree(pd->subport_uregbase);
1567        pd->subport_uregbase = NULL;
1568bail:
1569        return ret;
1570}
1571
1572static int try_alloc_port(struct ipath_devdata *dd, int port,
1573                          struct file *fp,
1574                          const struct ipath_user_info *uinfo)
1575{
1576        struct ipath_portdata *pd;
1577        int ret;
1578
1579        if (!(pd = dd->ipath_pd[port])) {
1580                void *ptmp;
1581
1582                pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
1583
1584                /*
1585                 * Allocate memory for use in ipath_tid_update() just once
1586                 * at open, not per call.  Reduces cost of expected send
1587                 * setup.
1588                 */
1589                ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
1590                               dd->ipath_rcvtidcnt * sizeof(struct page **),
1591                               GFP_KERNEL);
1592                if (!pd || !ptmp) {
1593                        ipath_dev_err(dd, "Unable to allocate portdata "
1594                                      "memory, failing open\n");
1595                        ret = -ENOMEM;
1596                        kfree(pd);
1597                        kfree(ptmp);
1598                        goto bail;
1599                }
1600                dd->ipath_pd[port] = pd;
1601                dd->ipath_pd[port]->port_port = port;
1602                dd->ipath_pd[port]->port_dd = dd;
1603                dd->ipath_pd[port]->port_tid_pg_list = ptmp;
1604                init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
1605        }
1606        if (!pd->port_cnt) {
1607                pd->userversion = uinfo->spu_userversion;
1608                init_user_egr_sizes(pd);
1609                if ((ret = init_subports(dd, pd, uinfo)) != 0)
1610                        goto bail;
1611                ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
1612                           current->comm, current->pid, dd->ipath_unit,
1613                           port);
1614                pd->port_cnt = 1;
1615                port_fp(fp) = pd;
1616                pd->port_pid = get_pid(task_pid(current));
1617                strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
1618                ipath_stats.sps_ports++;
1619                ret = 0;
1620        } else
1621                ret = -EBUSY;
1622
1623bail:
1624        return ret;
1625}
1626
1627static inline int usable(struct ipath_devdata *dd)
1628{
1629        return dd &&
1630                (dd->ipath_flags & IPATH_PRESENT) &&
1631                dd->ipath_kregbase &&
1632                dd->ipath_lid &&
1633                !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
1634                                     | IPATH_LINKUNK));
1635}
1636
1637static int find_free_port(int unit, struct file *fp,
1638                          const struct ipath_user_info *uinfo)
1639{
1640        struct ipath_devdata *dd = ipath_lookup(unit);
1641        int ret, i;
1642
1643        if (!dd) {
1644                ret = -ENODEV;
1645                goto bail;
1646        }
1647
1648        if (!usable(dd)) {
1649                ret = -ENETDOWN;
1650                goto bail;
1651        }
1652
1653        for (i = 1; i < dd->ipath_cfgports; i++) {
1654                ret = try_alloc_port(dd, i, fp, uinfo);
1655                if (ret != -EBUSY)
1656                        goto bail;
1657        }
1658        ret = -EBUSY;
1659
1660bail:
1661        return ret;
1662}
1663
1664static int find_best_unit(struct file *fp,
1665                          const struct ipath_user_info *uinfo)
1666{
1667        int ret = 0, i, prefunit = -1, devmax;
1668        int maxofallports, npresent, nup;
1669        int ndev;
1670
1671        devmax = ipath_count_units(&npresent, &nup, &maxofallports);
1672
1673        /*
1674         * This code is present to allow a knowledgeable person to
1675         * specify the layout of processes to processors before opening
1676         * this driver, and then we'll assign the process to the "closest"
1677         * InfiniPath chip to that processor (we assume reasonable connectivity,
1678         * for now).  This code assumes that if affinity has been set
1679         * before this point, that at most one cpu is set; for now this
1680         * is reasonable.  I check for both cpumask_empty() and cpumask_full(),
1681         * in case some kernel variant sets none of the bits when no
1682         * affinity is set.  2.6.11 and 12 kernels have all present
1683         * cpus set.  Some day we'll have to fix it up further to handle
1684         * a cpu subset.  This algorithm fails for two HT chips connected
1685         * in tunnel fashion.  Eventually this needs real topology
1686         * information.  There may be some issues with dual core numbering
1687         * as well.  This needs more work prior to release.
1688         */
1689        if (!cpumask_empty(tsk_cpus_allowed(current)) &&
1690            !cpumask_full(tsk_cpus_allowed(current))) {
1691                int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
1692                get_online_cpus();
1693                for_each_online_cpu(i)
1694                        if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
1695                                ipath_cdbg(PROC, "%s[%u] affinity set for "
1696                                           "cpu %d/%d\n", current->comm,
1697                                           current->pid, i, ncpus);
1698                                curcpu = i;
1699                                nset++;
1700                        }
1701                put_online_cpus();
1702                if (curcpu != -1 && nset != ncpus) {
1703                        if (npresent) {
1704                                prefunit = curcpu / (ncpus / npresent);
1705                                ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
1706                                          "%d cpus/chip, select unit %d\n",
1707                                          current->comm, current->pid,
1708                                          npresent, ncpus, ncpus / npresent,
1709                                          prefunit);
1710                        }
1711                }
1712        }
1713
1714        /*
1715         * user ports start at 1, kernel port is 0
1716         * For now, we do round-robin access across all chips
1717         */
1718
1719        if (prefunit != -1)
1720                devmax = prefunit + 1;
1721recheck:
1722        for (i = 1; i < maxofallports; i++) {
1723                for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
1724                     ndev++) {
1725                        struct ipath_devdata *dd = ipath_lookup(ndev);
1726
1727                        if (!usable(dd))
1728                                continue; /* can't use this unit */
1729                        if (i >= dd->ipath_cfgports)
1730                                /*
1731                                 * Maxed out on users of this unit. Try
1732                                 * next.
1733                                 */
1734                                continue;
1735                        ret = try_alloc_port(dd, i, fp, uinfo);
1736                        if (!ret)
1737                                goto done;
1738                }
1739        }
1740
1741        if (npresent) {
1742                if (nup == 0) {
1743                        ret = -ENETDOWN;
1744                        ipath_dbg("No ports available (none initialized "
1745                                  "and ready)\n");
1746                } else {
1747                        if (prefunit > 0) {
1748                                /* if started above 0, retry from 0 */
1749                                ipath_cdbg(PROC,
1750                                           "%s[%u] no ports on prefunit "
1751                                           "%d, clear and re-check\n",
1752                                           current->comm, current->pid,
1753                                           prefunit);
1754                                devmax = ipath_count_units(NULL, NULL,
1755                                                           NULL);
1756                                prefunit = -1;
1757                                goto recheck;
1758                        }
1759                        ret = -EBUSY;
1760                        ipath_dbg("No ports available\n");
1761                }
1762        } else {
1763                ret = -ENXIO;
1764                ipath_dbg("No boards found\n");
1765        }
1766
1767done:
1768        return ret;
1769}
1770
1771static int find_shared_port(struct file *fp,
1772                            const struct ipath_user_info *uinfo)
1773{
1774        int devmax, ndev, i;
1775        int ret = 0;
1776
1777        devmax = ipath_count_units(NULL, NULL, NULL);
1778
1779        for (ndev = 0; ndev < devmax; ndev++) {
1780                struct ipath_devdata *dd = ipath_lookup(ndev);
1781
1782                if (!usable(dd))
1783                        continue;
1784                for (i = 1; i < dd->ipath_cfgports; i++) {
1785                        struct ipath_portdata *pd = dd->ipath_pd[i];
1786
1787                        /* Skip ports which are not yet open */
1788                        if (!pd || !pd->port_cnt)
1789                                continue;
1790                        /* Skip port if it doesn't match the requested one */
1791                        if (pd->port_subport_id != uinfo->spu_subport_id)
1792                                continue;
1793                        /* Verify the sharing process matches the master */
1794                        if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
1795                            pd->userversion != uinfo->spu_userversion ||
1796                            pd->port_cnt >= pd->port_subport_cnt) {
1797                                ret = -EINVAL;
1798                                goto done;
1799                        }
1800                        port_fp(fp) = pd;
1801                        subport_fp(fp) = pd->port_cnt++;
1802                        pd->port_subpid[subport_fp(fp)] =
1803                                get_pid(task_pid(current));
1804                        tidcursor_fp(fp) = 0;
1805                        pd->active_slaves |= 1 << subport_fp(fp);
1806                        ipath_cdbg(PROC,
1807                                   "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
1808                                   current->comm, current->pid,
1809                                   subport_fp(fp),
1810                                   pd->port_comm, pid_nr(pd->port_pid),
1811                                   dd->ipath_unit, pd->port_port);
1812                        ret = 1;
1813                        goto done;
1814                }
1815        }
1816
1817done:
1818        return ret;
1819}
1820
1821static int ipath_open(struct inode *in, struct file *fp)
1822{
1823        /* The real work is performed later in ipath_assign_port() */
1824        fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
1825        return fp->private_data ? 0 : -ENOMEM;
1826}
1827
1828/* Get port early, so can set affinity prior to memory allocation */
1829static int ipath_assign_port(struct file *fp,
1830                              const struct ipath_user_info *uinfo)
1831{
1832        int ret;
1833        int i_minor;
1834        unsigned swmajor, swminor;
1835
1836        /* Check to be sure we haven't already initialized this file */
1837        if (port_fp(fp)) {
1838                ret = -EINVAL;
1839                goto done;
1840        }
1841
1842        /* for now, if major version is different, bail */
1843        swmajor = uinfo->spu_userversion >> 16;
1844        if (swmajor != IPATH_USER_SWMAJOR) {
1845                ipath_dbg("User major version %d not same as driver "
1846                          "major %d\n", uinfo->spu_userversion >> 16,
1847                          IPATH_USER_SWMAJOR);
1848                ret = -ENODEV;
1849                goto done;
1850        }
1851
1852        swminor = uinfo->spu_userversion & 0xffff;
1853        if (swminor != IPATH_USER_SWMINOR)
1854                ipath_dbg("User minor version %d not same as driver "
1855                          "minor %d\n", swminor, IPATH_USER_SWMINOR);
1856
1857        mutex_lock(&ipath_mutex);
1858
1859        if (ipath_compatible_subports(swmajor, swminor) &&
1860            uinfo->spu_subport_cnt &&
1861            (ret = find_shared_port(fp, uinfo))) {
1862                if (ret > 0)
1863                        ret = 0;
1864                goto done_chk_sdma;
1865        }
1866
1867        i_minor = iminor(fp->f_path.dentry->d_inode) - IPATH_USER_MINOR_BASE;
1868        ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
1869                   (long)fp->f_path.dentry->d_inode->i_rdev, i_minor);
1870
1871        if (i_minor)
1872                ret = find_free_port(i_minor - 1, fp, uinfo);
1873        else
1874                ret = find_best_unit(fp, uinfo);
1875
1876done_chk_sdma:
1877        if (!ret) {
1878                struct ipath_filedata *fd = fp->private_data;
1879                const struct ipath_portdata *pd = fd->pd;
1880                const struct ipath_devdata *dd = pd->port_dd;
1881
1882                fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
1883                                                      dd->ipath_unit,
1884                                                      pd->port_port,
1885                                                      fd->subport);
1886
1887                if (!fd->pq)
1888                        ret = -ENOMEM;
1889        }
1890
1891        mutex_unlock(&ipath_mutex);
1892
1893done:
1894        return ret;
1895}
1896
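/*
 * Usage sketch (illustrative only, not part of the driver): port
 * assignment is driven from userspace by writing an IPATH_CMD_ASSIGN_PORT
 * command to the fd (see ipath_write() below).  Opening the wildcard node
 * (typically /dev/ipath with udev) lets find_best_unit() pick a chip,
 * while a per-unit node restricts the open to that unit.  Assuming
 * ipath_common.h provides struct ipath_cmd and struct ipath_user_info:
 *
 *      #include <unistd.h>
 *      #include "ipath_common.h"
 *
 *      static int assign_port(int fd, unsigned int subports, unsigned int id)
 *      {
 *              struct ipath_cmd c = { .type = IPATH_CMD_ASSIGN_PORT };
 *
 *              c.cmd.user_info.spu_userversion =
 *                      (IPATH_USER_SWMAJOR << 16) | IPATH_USER_SWMINOR;
 *              c.cmd.user_info.spu_subport_cnt = subports; // 0: no sharing
 *              c.cmd.user_info.spu_subport_id = id;        // shared-port group
 *              return write(fd, &c, sizeof(c)) < 0 ? -1 : 0;
 *      }
 */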
1897
1898static int ipath_do_user_init(struct file *fp,
1899                              const struct ipath_user_info *uinfo)
1900{
1901        int ret;
1902        struct ipath_portdata *pd = port_fp(fp);
1903        struct ipath_devdata *dd;
1904        u32 head32;
1905
1906        /* Subports don't need to initialize anything since master did it. */
1907        if (subport_fp(fp)) {
1908                ret = wait_event_interruptible(pd->port_wait,
1909                        !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
1910                goto done;
1911        }
1912
1913        dd = pd->port_dd;
1914
1915        if (uinfo->spu_rcvhdrsize) {
1916                ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
1917                if (ret)
1918                        goto done;
1919        }
1920
1921        /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
1922
1923        /* some ports may get extra buffers, calculate that here */
1924        if (pd->port_port <= dd->ipath_ports_extrabuf)
1925                pd->port_piocnt = dd->ipath_pbufsport + 1;
1926        else
1927                pd->port_piocnt = dd->ipath_pbufsport;
1928
1929        /* for right now, kernel piobufs are at end, so port 1 is at 0 */
1930        if (pd->port_port <= dd->ipath_ports_extrabuf)
1931                pd->port_pio_base = (dd->ipath_pbufsport + 1)
1932                        * (pd->port_port - 1);
1933        else
1934                pd->port_pio_base = dd->ipath_ports_extrabuf +
1935                        dd->ipath_pbufsport * (pd->port_port - 1);
1936        pd->port_piobufs = dd->ipath_piobufbase +
1937                pd->port_pio_base * dd->ipath_palign;
1938        ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
1939                " first pio %u\n", pd->port_port, pd->port_piobufs,
1940                pd->port_piocnt, pd->port_pio_base);
1941        ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
1942
1943        /*
1944         * Now allocate the rcvhdr Q and eager TIDs; skip the TID
1945         * array for the time being.  If pd->port_port > chip-supported,
1946         * we would need extra handling here, e.g. overflowing through
1947         * port 0, someday.
1948         */
1949        ret = ipath_create_rcvhdrq(dd, pd);
1950        if (!ret)
1951                ret = ipath_create_user_egr(pd);
1952        if (ret)
1953                goto done;
1954
1955        /*
1956         * set the eager head register for this port to the current values
1957         * of the tail pointers, since we don't know if they were
1958         * updated on last use of the port.
1959         */
1960        head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
1961        ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
1962        pd->port_lastrcvhdrqtail = -1;
1963        ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
1964                pd->port_port, head32);
1965        pd->port_tidcursor = 0; /* start at beginning after open */
1966
1967        /* initialize poll variables... */
1968        pd->port_urgent = 0;
1969        pd->port_urgent_poll = 0;
1970        pd->port_hdrqfull_poll = pd->port_hdrqfull;
1971
1972        /*
1973         * Now enable the port for receive.
1974         * Some chips DMA the tail register to memory when it changes,
1975         * and also when the update bit transitions from 0 to 1; for
1976         * those chips, we turn tail updates off and then back on.
1977         * This will (very briefly) affect any other open ports, but the
1978         * duration is very short, and therefore isn't an issue.  We
1979         * explicitly set the in-memory tail copy to 0 beforehand, so we
1980         * don't have to wait to be sure the DMA update has happened
1981         * (chip resets head/tail to 0 on transition to enable).
1982         */
1983        set_bit(dd->ipath_r_portenable_shift + pd->port_port,
1984                &dd->ipath_rcvctrl);
1985        if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
1986                if (pd->port_rcvhdrtail_kvaddr)
1987                        ipath_clear_rcvhdrtail(pd);
1988                ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
1989                        dd->ipath_rcvctrl &
1990                        ~(1ULL << dd->ipath_r_tailupd_shift));
1991        }
1992        ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
1993                         dd->ipath_rcvctrl);
1994        /* Notify any waiting slaves */
1995        if (pd->port_subport_cnt) {
1996                clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
1997                wake_up(&pd->port_wait);
1998        }
1999done:
2000        return ret;
2001}
2002
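/*
 * Usage sketch (illustrative only, not part of the driver): after port
 * assignment, userspace sends IPATH_CMD_USER_INIT with spu_base_info
 * pointing at a buffer for ipath_get_base_info() to fill in; the buffer
 * layout comes from ipath_common.h and is treated as opaque bytes here:
 *
 *      #include <stdint.h>
 *      #include <unistd.h>
 *      #include "ipath_common.h"
 *
 *      static int user_init(int fd, void *base_info, size_t base_info_size)
 *      {
 *              struct ipath_cmd c = { .type = IPATH_CMD_USER_INIT };
 *
 *              c.cmd.user_info.spu_userversion =
 *                      (IPATH_USER_SWMAJOR << 16) | IPATH_USER_SWMINOR;
 *              c.cmd.user_info.spu_base_info = (uint64_t)(uintptr_t)base_info;
 *              c.cmd.user_info.spu_base_info_size = base_info_size;
 *              return write(fd, &c, sizeof(c)) < 0 ? -1 : 0;
 *      }
 */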
2003/**
2004 * unlock_expected_tids - unlock any expected TID entries the port still had in use
2005 * @pd: port
2006 *
2007 * We don't actually update the chip here, because we do a bulk update
2008 * below, using ipath_f_clear_tids.
2009 */
2010static void unlock_expected_tids(struct ipath_portdata *pd)
2011{
2012        struct ipath_devdata *dd = pd->port_dd;
2013        int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
2014        int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
2015
2016        ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
2017                   pd->port_port);
2018        for (i = port_tidbase; i < maxtid; i++) {
2019                struct page *ps = dd->ipath_pageshadow[i];
2020
2021                if (!ps)
2022                        continue;
2023
2024                dd->ipath_pageshadow[i] = NULL;
2025                pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
2026                        PAGE_SIZE, PCI_DMA_FROMDEVICE);
2027                ipath_release_user_pages_on_close(&ps, 1);
2028                cnt++;
2029                ipath_stats.sps_pageunlocks++;
2030        }
2031        if (cnt)
2032                ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n",
2033                           pd->port_port, cnt);
2034
2035        if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
2036                ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
2037                           (unsigned long long) ipath_stats.sps_pagelocks,
2038                           (unsigned long long)
2039                           ipath_stats.sps_pageunlocks);
2040}
2041
2042static int ipath_close(struct inode *in, struct file *fp)
2043{
2044        int ret = 0;
2045        struct ipath_filedata *fd;
2046        struct ipath_portdata *pd;
2047        struct ipath_devdata *dd;
2048        unsigned long flags;
2049        unsigned port;
2050        struct pid *pid;
2051
2052        ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
2053                   (long)in->i_rdev, fp->private_data);
2054
2055        mutex_lock(&ipath_mutex);
2056
2057        fd = fp->private_data;
2058        fp->private_data = NULL;
2059        pd = fd->pd;
2060        if (!pd) {
2061                mutex_unlock(&ipath_mutex);
2062                goto bail;
2063        }
2064
2065        dd = pd->port_dd;
2066
2067        /* drain user sdma queue */
2068        ipath_user_sdma_queue_drain(dd, fd->pq);
2069        ipath_user_sdma_queue_destroy(fd->pq);
2070
2071        if (--pd->port_cnt) {
2072                /*
2073                 * XXX If the master closes the port before the slave(s),
2074                 * revoke the mmap for the eager receive queue so
2075                 * the slave(s) don't wait for receive data forever.
2076                 */
2077                pd->active_slaves &= ~(1 << fd->subport);
2078                put_pid(pd->port_subpid[fd->subport]);
2079                pd->port_subpid[fd->subport] = NULL;
2080                mutex_unlock(&ipath_mutex);
2081                goto bail;
2082        }
2083        /* early; no interrupt users after this */
2084        spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
2085        port = pd->port_port;
2086        dd->ipath_pd[port] = NULL;
2087        pid = pd->port_pid;
2088        pd->port_pid = NULL;
2089        spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
2090
2091        if (pd->port_rcvwait_to || pd->port_piowait_to
2092            || pd->port_rcvnowait || pd->port_pionowait) {
2093                ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; "
2094                           "%u rcv, %u pio already\n",
2095                           pd->port_port, pd->port_rcvwait_to,
2096                           pd->port_piowait_to, pd->port_rcvnowait,
2097                           pd->port_pionowait);
2098                pd->port_rcvwait_to = pd->port_piowait_to =
2099                        pd->port_rcvnowait = pd->port_pionowait = 0;
2100        }
2101        if (pd->port_flag) {
2102                ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
2103                          pd->port_port, pd->port_flag);
2104                pd->port_flag = 0;
2105        }
2106
2107        if (dd->ipath_kregbase) {
2108                /* atomically clear receive enable port and intr avail. */
2109                clear_bit(dd->ipath_r_portenable_shift + port,
2110                          &dd->ipath_rcvctrl);
2111                clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
2112                          &dd->ipath_rcvctrl);
2113                ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
2114                        dd->ipath_rcvctrl);
2115                /* and read back from chip to be sure that nothing
2116                 * else is in flight when we do the rest */
2117                (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
2118
2119                /* clean up the pkeys for this port user */
2120                ipath_clean_part_key(pd, dd);
2121                /*
2122                 * be paranoid, and never write 0's to these, just use an
2123                 * unused part of the port 0 tail page.  Of course,
2124                 * rcvhdraddr points to a large chunk of memory, so this
2125                 * could still trash things, but at least it won't trash
2126                 * page 0, and by disabling the port, it should stop "soon",
2127                 * even if a packet or two is already in flight after we
2128                 * disabled the port.
2129                 */
2130                ipath_write_kreg_port(dd,
2131                        dd->ipath_kregs->kr_rcvhdrtailaddr, port,
2132                        dd->ipath_dummy_hdrq_phys);
2133                ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
2134                        pd->port_port, dd->ipath_dummy_hdrq_phys);
2135
2136                ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
2137                ipath_chg_pioavailkernel(dd, pd->port_pio_base,
2138                        pd->port_piocnt, 1);
2139
2140                dd->ipath_f_clear_tids(dd, pd->port_port);
2141
2142                if (dd->ipath_pageshadow)
2143                        unlock_expected_tids(pd);
2144                ipath_stats.sps_ports--;
2145                ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
2146                           pd->port_comm, pid_nr(pid),
2147                           dd->ipath_unit, port);
2148        }
2149
2150        put_pid(pid);
2151        mutex_unlock(&ipath_mutex);
2152        ipath_free_pddata(dd, pd); /* after releasing the mutex */
2153
2154bail:
2155        kfree(fd);
2156        return ret;
2157}
2158
2159static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
2160                           struct ipath_port_info __user *uinfo)
2161{
2162        struct ipath_port_info info;
2163        int nup;
2164        int ret;
2165        size_t sz;
2166
2167        (void) ipath_count_units(NULL, &nup, NULL);
2168        info.num_active = nup;
2169        info.unit = pd->port_dd->ipath_unit;
2170        info.port = pd->port_port;
2171        info.subport = subport;
2172        /* Don't return new fields if old library opened the port. */
2173        if (ipath_supports_subports(pd->userversion >> 16,
2174                                    pd->userversion & 0xffff)) {
2175                /* Number of user ports available for this device. */
2176                info.num_ports = pd->port_dd->ipath_cfgports - 1;
2177                info.num_subports = pd->port_subport_cnt;
2178                sz = sizeof(info);
2179        } else
2180                sz = sizeof(info) - 2 * sizeof(u16);
2181
2182        if (copy_to_user(uinfo, &info, sz)) {
2183                ret = -EFAULT;
2184                goto bail;
2185        }
2186        ret = 0;
2187
2188bail:
2189        return ret;
2190}
2191
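/*
 * Usage sketch (illustrative only, not part of the driver): the
 * IPATH_CMD_PORT_INFO payload is a user pointer, cast through u64, to a
 * struct ipath_port_info that the driver fills in; old libraries that
 * predate subports get the struct truncated by the last two u16 fields,
 * as shown above.  Assuming ipath_common.h provides the types used below:
 *
 *      #include <stdint.h>
 *      #include <unistd.h>
 *      #include "ipath_common.h"
 *
 *      static int get_port_info(int fd, struct ipath_port_info *info)
 *      {
 *              struct ipath_cmd c = { .type = IPATH_CMD_PORT_INFO };
 *
 *              c.cmd.port_info = (uint64_t)(uintptr_t)info;
 *              return write(fd, &c, sizeof(c)) < 0 ? -1 : 0;
 *      }
 */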
2192static int ipath_get_slave_info(struct ipath_portdata *pd,
2193                                void __user *slave_mask_addr)
2194{
2195        int ret = 0;
2196
2197        if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
2198                ret = -EFAULT;
2199        return ret;
2200}
2201
2202static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
2203                                   u32 __user *inflightp)
2204{
2205        const u32 val = ipath_user_sdma_inflight_counter(pq);
2206
2207        if (put_user(val, inflightp))
2208                return -EFAULT;
2209
2210        return 0;
2211}
2212
2213static int ipath_sdma_get_complete(struct ipath_devdata *dd,
2214                                   struct ipath_user_sdma_queue *pq,
2215                                   u32 __user *completep)
2216{
2217        u32 val;
2218        int err;
2219
2220        err = ipath_user_sdma_make_progress(dd, pq);
2221        if (err < 0)
2222                return err;
2223
2224        val = ipath_user_sdma_complete_counter(pq);
2225        if (put_user(val, completep))
2226                return -EFAULT;
2227
2228        return 0;
2229}
2230
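/*
 * Usage sketch (illustrative only, not part of the driver): the two SDMA
 * queries above export monotonically increasing counters; userspace
 * compares the "complete" counter against the number of packets it has
 * submitted to learn when send buffers may be reused:
 *
 *      #include <stdint.h>
 *      #include <unistd.h>
 *      #include "ipath_common.h"
 *
 *      static int sdma_complete_counter(int fd, uint32_t *counter)
 *      {
 *              struct ipath_cmd c = { .type = IPATH_CMD_SDMA_COMPLETE };
 *
 *              c.cmd.sdma_complete = (uint64_t)(uintptr_t)counter;
 *              return write(fd, &c, sizeof(c)) < 0 ? -1 : 0;
 *      }
 */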
2231static ssize_t ipath_write(struct file *fp, const char __user *data,
2232                           size_t count, loff_t *off)
2233{
2234        const struct ipath_cmd __user *ucmd;
2235        struct ipath_portdata *pd;
2236        const void __user *src;
2237        size_t consumed, copy;
2238        struct ipath_cmd cmd;
2239        ssize_t ret = 0;
2240        void *dest;
2241
2242        if (count < sizeof(cmd.type)) {
2243                ret = -EINVAL;
2244                goto bail;
2245        }
2246
2247        ucmd = (const struct ipath_cmd __user *) data;
2248
2249        if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
2250                ret = -EFAULT;
2251                goto bail;
2252        }
2253
2254        consumed = sizeof(cmd.type);
2255
2256        switch (cmd.type) {
2257        case IPATH_CMD_ASSIGN_PORT:
2258        case __IPATH_CMD_USER_INIT:
2259        case IPATH_CMD_USER_INIT:
2260                copy = sizeof(cmd.cmd.user_info);
2261                dest = &cmd.cmd.user_info;
2262                src = &ucmd->cmd.user_info;
2263                break;
2264        case IPATH_CMD_RECV_CTRL:
2265                copy = sizeof(cmd.cmd.recv_ctrl);
2266                dest = &cmd.cmd.recv_ctrl;
2267                src = &ucmd->cmd.recv_ctrl;
2268                break;
2269        case IPATH_CMD_PORT_INFO:
2270                copy = sizeof(cmd.cmd.port_info);
2271                dest = &cmd.cmd.port_info;
2272                src = &ucmd->cmd.port_info;
2273                break;
2274        case IPATH_CMD_TID_UPDATE:
2275        case IPATH_CMD_TID_FREE:
2276                copy = sizeof(cmd.cmd.tid_info);
2277                dest = &cmd.cmd.tid_info;
2278                src = &ucmd->cmd.tid_info;
2279                break;
2280        case IPATH_CMD_SET_PART_KEY:
2281                copy = sizeof(cmd.cmd.part_key);
2282                dest = &cmd.cmd.part_key;
2283                src = &ucmd->cmd.part_key;
2284                break;
2285        case __IPATH_CMD_SLAVE_INFO:
2286                copy = sizeof(cmd.cmd.slave_mask_addr);
2287                dest = &cmd.cmd.slave_mask_addr;
2288                src = &ucmd->cmd.slave_mask_addr;
2289                break;
2290        case IPATH_CMD_PIOAVAILUPD:     /* force an update of PIOAvail reg */
2291                copy = 0;
2292                src = NULL;
2293                dest = NULL;
2294                break;
2295        case IPATH_CMD_POLL_TYPE:
2296                copy = sizeof(cmd.cmd.poll_type);
2297                dest = &cmd.cmd.poll_type;
2298                src = &ucmd->cmd.poll_type;
2299                break;
2300        case IPATH_CMD_ARMLAUNCH_CTRL:
2301                copy = sizeof(cmd.cmd.armlaunch_ctrl);
2302                dest = &cmd.cmd.armlaunch_ctrl;
2303                src = &ucmd->cmd.armlaunch_ctrl;
2304                break;
2305        case IPATH_CMD_SDMA_INFLIGHT:
2306                copy = sizeof(cmd.cmd.sdma_inflight);
2307                dest = &cmd.cmd.sdma_inflight;
2308                src = &ucmd->cmd.sdma_inflight;
2309                break;
2310        case IPATH_CMD_SDMA_COMPLETE:
2311                copy = sizeof(cmd.cmd.sdma_complete);
2312                dest = &cmd.cmd.sdma_complete;
2313                src = &ucmd->cmd.sdma_complete;
2314                break;
2315        default:
2316                ret = -EINVAL;
2317                goto bail;
2318        }
2319
2320        if (copy) {
2321                if ((count - consumed) < copy) {
2322                        ret = -EINVAL;
2323                        goto bail;
2324                }
2325
2326                if (copy_from_user(dest, src, copy)) {
2327                        ret = -EFAULT;
2328                        goto bail;
2329                }
2330
2331                consumed += copy;
2332        }
2333
2334        pd = port_fp(fp);
2335        if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
2336                cmd.type != IPATH_CMD_ASSIGN_PORT) {
2337                ret = -EINVAL;
2338                goto bail;
2339        }
2340
2341        switch (cmd.type) {
2342        case IPATH_CMD_ASSIGN_PORT:
2343                ret = ipath_assign_port(fp, &cmd.cmd.user_info);
2344                if (ret)
2345                        goto bail;
2346                break;
2347        case __IPATH_CMD_USER_INIT:
2348                /* backwards compatibility, get port first */
2349                ret = ipath_assign_port(fp, &cmd.cmd.user_info);
2350                if (ret)
2351                        goto bail;
2352                /* and fall through to current version. */
2353        case IPATH_CMD_USER_INIT:
2354                ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
2355                if (ret)
2356                        goto bail;
2357                ret = ipath_get_base_info(
2358                        fp, (void __user *) (unsigned long)
2359                        cmd.cmd.user_info.spu_base_info,
2360                        cmd.cmd.user_info.spu_base_info_size);
2361                break;
2362        case IPATH_CMD_RECV_CTRL:
2363                ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
2364                break;
2365        case IPATH_CMD_PORT_INFO:
2366                ret = ipath_port_info(pd, subport_fp(fp),
2367                                      (struct ipath_port_info __user *)
2368                                      (unsigned long) cmd.cmd.port_info);
2369                break;
2370        case IPATH_CMD_TID_UPDATE:
2371                ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
2372                break;
2373        case IPATH_CMD_TID_FREE:
2374                ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
2375                break;
2376        case IPATH_CMD_SET_PART_KEY:
2377                ret = ipath_set_part_key(pd, cmd.cmd.part_key);
2378                break;
2379        case __IPATH_CMD_SLAVE_INFO:
2380                ret = ipath_get_slave_info(pd,
2381                                           (void __user *) (unsigned long)
2382                                           cmd.cmd.slave_mask_addr);
2383                break;
2384        case IPATH_CMD_PIOAVAILUPD:
2385                ipath_force_pio_avail_update(pd->port_dd);
2386                break;
2387        case IPATH_CMD_POLL_TYPE:
2388                pd->poll_type = cmd.cmd.poll_type;
2389                break;
2390        case IPATH_CMD_ARMLAUNCH_CTRL:
2391                if (cmd.cmd.armlaunch_ctrl)
2392                        ipath_enable_armlaunch(pd->port_dd);
2393                else
2394                        ipath_disable_armlaunch(pd->port_dd);
2395                break;
2396        case IPATH_CMD_SDMA_INFLIGHT:
2397                ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
2398                                              (u32 __user *) (unsigned long)
2399                                              cmd.cmd.sdma_inflight);
2400                break;
2401        case IPATH_CMD_SDMA_COMPLETE:
2402                ret = ipath_sdma_get_complete(pd->port_dd,
2403                                              user_sdma_queue_fp(fp),
2404                                              (u32 __user *) (unsigned long)
2405                                              cmd.cmd.sdma_complete);
2406                break;
2407        }
2408
2409        if (ret >= 0)
2410                ret = consumed;
2411
2412bail:
2413        return ret;
2414}
2415
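/*
 * Usage sketch (illustrative only, not part of the driver): every command
 * shares the framing seen above: the write starts with cmd.type, followed
 * (for most commands) by the matching member of the cmd union, and the
 * return value is the number of bytes consumed, which may be less than
 * the count written.  IPATH_CMD_PIOAVAILUPD carries no payload, so forcing
 * a PIO-avail update is simply:
 *
 *      #include <unistd.h>
 *      #include "ipath_common.h"
 *
 *      static int force_pioavail_update(int fd)
 *      {
 *              struct ipath_cmd c = { .type = IPATH_CMD_PIOAVAILUPD };
 *
 *              return write(fd, &c, sizeof(c)) < 0 ? -1 : 0;
 *      }
 */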
2416static ssize_t ipath_writev(struct kiocb *iocb, const struct iovec *iov,
2417                            unsigned long dim, loff_t off)
2418{
2419        struct file *filp = iocb->ki_filp;
2420        struct ipath_filedata *fp = filp->private_data;
2421        struct ipath_portdata *pd = port_fp(filp);
2422        struct ipath_user_sdma_queue *pq = fp->pq;
2423
2424        if (!dim)
2425                return -EINVAL;
2426
2427        return ipath_user_sdma_writev(pd->port_dd, pq, iov, dim);
2428}
2429
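/*
 * Usage sketch (illustrative only, not part of the driver): SDMA sends go
 * through writev() on the port fd (the aio_write hook above), one or more
 * iovecs per call.  The layout of each transfer (header plus payload) is
 * defined by the user SDMA code in ipath_user_sdma.c and the user library
 * and is not reproduced here; this only shows the system-call side:
 *
 *      #include <sys/uio.h>
 *
 *      static ssize_t submit_sdma(int fd, const struct iovec *iov, int niov)
 *      {
 *              return writev(fd, iov, niov);   // negative on error
 *      }
 */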
2430static struct class *ipath_class;
2431
2432static int init_cdev(int minor, char *name, const struct file_operations *fops,
2433                     struct cdev **cdevp, struct device **devp)
2434{
2435        const dev_t dev = MKDEV(IPATH_MAJOR, minor);
2436        struct cdev *cdev = NULL;
2437        struct device *device = NULL;
2438        int ret;
2439
2440        cdev = cdev_alloc();
2441        if (!cdev) {
2442                printk(KERN_ERR IPATH_DRV_NAME
2443                       ": Could not allocate cdev for minor %d, %s\n",
2444                       minor, name);
2445                ret = -ENOMEM;
2446                goto done;
2447        }
2448
2449        cdev->owner = THIS_MODULE;
2450        cdev->ops = fops;
2451        kobject_set_name(&cdev->kobj, name);
2452
2453        ret = cdev_add(cdev, dev, 1);
2454        if (ret < 0) {
2455                printk(KERN_ERR IPATH_DRV_NAME
2456                       ": Could not add cdev for minor %d, %s (err %d)\n",
2457                       minor, name, -ret);
2458                goto err_cdev;
2459        }
2460
2461        device = device_create(ipath_class, NULL, dev, NULL, name);
2462
2463        if (IS_ERR(device)) {
2464                ret = PTR_ERR(device);
2465                printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
2466                       "device for minor %d, %s (err %d)\n",
2467                       minor, name, -ret);
2468                goto err_cdev;
2469        }
2470
2471        goto done;
2472
2473err_cdev:
2474        cdev_del(cdev);
2475        cdev = NULL;
2476
2477done:
2478        if (ret >= 0) {
2479                *cdevp = cdev;
2480                *devp = device;
2481        } else {
2482                *cdevp = NULL;
2483                *devp = NULL;
2484        }
2485
2486        return ret;
2487}
2488
2489int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
2490                    struct cdev **cdevp, struct device **devp)
2491{
2492        return init_cdev(minor, name, fops, cdevp, devp);
2493}
2494
2495static void cleanup_cdev(struct cdev **cdevp,
2496                         struct device **devp)
2497{
2498        struct device *dev = *devp;
2499
2500        if (dev) {
2501                device_unregister(dev);
2502                *devp = NULL;
2503        }
2504
2505        if (*cdevp) {
2506                cdev_del(*cdevp);
2507                *cdevp = NULL;
2508        }
2509}
2510
2511void ipath_cdev_cleanup(struct cdev **cdevp,
2512                        struct device **devp)
2513{
2514        cleanup_cdev(cdevp, devp);
2515}
2516
2517static struct cdev *wildcard_cdev;
2518static struct device *wildcard_dev;
2519
2520static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
2521
2522static int user_init(void)
2523{
2524        int ret;
2525
2526        ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
2527        if (ret < 0) {
2528                printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
2529                       "chrdev region (err %d)\n", -ret);
2530                goto done;
2531        }
2532
2533        ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
2534
2535        if (IS_ERR(ipath_class)) {
2536                ret = PTR_ERR(ipath_class);
2537                printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
2538                       "device class (err %d)\n", -ret);
2539                goto bail;
2540        }
2541
2542        goto done;
2543bail:
2544        unregister_chrdev_region(dev, IPATH_NMINORS);
2545done:
2546        return ret;
2547}
2548
2549static void user_cleanup(void)
2550{
2551        if (ipath_class) {
2552                class_destroy(ipath_class);
2553                ipath_class = NULL;
2554        }
2555
2556        unregister_chrdev_region(dev, IPATH_NMINORS);
2557}
2558
2559static atomic_t user_count = ATOMIC_INIT(0);
2560static atomic_t user_setup = ATOMIC_INIT(0);
2561
2562int ipath_user_add(struct ipath_devdata *dd)
2563{
2564        char name[10];
2565        int ret;
2566
2567        if (atomic_inc_return(&user_count) == 1) {
2568                ret = user_init();
2569                if (ret < 0) {
2570                        ipath_dev_err(dd, "Unable to set up user support: "
2571                                      "error %d\n", -ret);
2572                        goto bail;
2573                }
2574                ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
2575                                &wildcard_dev);
2576                if (ret < 0) {
2577                        ipath_dev_err(dd, "Could not create wildcard "
2578                                      "minor: error %d\n", -ret);
2579                        goto bail_user;
2580                }
2581
2582                atomic_set(&user_setup, 1);
2583        }
2584
2585        snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
2586
2587        ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
2588                        &dd->user_cdev, &dd->user_dev);
2589        if (ret < 0)
2590                ipath_dev_err(dd, "Could not create user minor %d, %s\n",
2591                              dd->ipath_unit + 1, name);
2592
2593        goto bail;
2594
2595bail_user:
2596        user_cleanup();
2597bail:
2598        return ret;
2599}
2600
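/*
 * Note (not part of the driver): with udev in place this typically yields
 * /dev/ipath (the wildcard node, minor 0) plus one /dev/ipathN per unit
 * (minor N + 1).  Opening the wildcard node lets the driver choose a unit
 * in find_best_unit(); opening a per-unit node restricts the open to that
 * unit, as the i_minor handling in ipath_assign_port() shows.  For example:
 *
 *      #include <fcntl.h>
 *      #include <stdio.h>
 *
 *      static int open_ipath(int unit)      // unit < 0: let the driver pick
 *      {
 *              char name[16];
 *
 *              if (unit < 0)
 *                      return open("/dev/ipath", O_RDWR);
 *              snprintf(name, sizeof(name), "/dev/ipath%d", unit);
 *              return open(name, O_RDWR);
 *      }
 */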
2601void ipath_user_remove(struct ipath_devdata *dd)
2602{
2603        cleanup_cdev(&dd->user_cdev, &dd->user_dev);
2604
2605        if (atomic_dec_return(&user_count) == 0) {
2606                if (atomic_read(&user_setup) == 0)
2607                        goto bail;
2608
2609                cleanup_cdev(&wildcard_cdev, &wildcard_dev);
2610                user_cleanup();
2611
2612                atomic_set(&user_setup, 0);
2613        }
2614bail:
2615        return;
2616}
2617