linux/drivers/infiniband/hw/hfi1/file_ops.c
<<
>>
Prefs
   1/*
   2 * Copyright(c) 2015, 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47#include <linux/poll.h>
  48#include <linux/cdev.h>
  49#include <linux/vmalloc.h>
  50#include <linux/io.h>
  51#include <linux/sched/mm.h>
  52
  53#include <rdma/ib.h>
  54
  55#include "hfi.h"
  56#include "pio.h"
  57#include "device.h"
  58#include "common.h"
  59#include "trace.h"
  60#include "user_sdma.h"
  61#include "user_exp_rcv.h"
  62#include "aspm.h"
  63#include "mmu_rb.h"
  64
  65#undef pr_fmt
  66#define pr_fmt(fmt) DRIVER_NAME ": " fmt
  67
  68#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
  69
  70/*
  71 * File operation functions
  72 */
  73static int hfi1_file_open(struct inode *, struct file *);
  74static int hfi1_file_close(struct inode *, struct file *);
  75static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *);
  76static unsigned int hfi1_poll(struct file *, struct poll_table_struct *);
  77static int hfi1_file_mmap(struct file *, struct vm_area_struct *);
  78
  79static u64 kvirt_to_phys(void *);
  80static int assign_ctxt(struct file *, struct hfi1_user_info *);
  81static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *);
  82static int user_init(struct file *);
  83static int get_ctxt_info(struct file *, void __user *, __u32);
  84static int get_base_info(struct file *, void __user *, __u32);
  85static int setup_ctxt(struct file *);
  86static int setup_subctxt(struct hfi1_ctxtdata *);
  87static int get_user_context(struct file *, struct hfi1_user_info *, int);
  88static int find_shared_ctxt(struct file *, const struct hfi1_user_info *);
  89static int allocate_ctxt(struct file *, struct hfi1_devdata *,
  90                         struct hfi1_user_info *);
  91static unsigned int poll_urgent(struct file *, struct poll_table_struct *);
  92static unsigned int poll_next(struct file *, struct poll_table_struct *);
  93static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long);
  94static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16);
  95static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int);
  96static int vma_fault(struct vm_fault *);
  97static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
  98                            unsigned long arg);
  99
 100static const struct file_operations hfi1_file_ops = {
 101        .owner = THIS_MODULE,
 102        .write_iter = hfi1_write_iter,
 103        .open = hfi1_file_open,
 104        .release = hfi1_file_close,
 105        .unlocked_ioctl = hfi1_file_ioctl,
 106        .poll = hfi1_poll,
 107        .mmap = hfi1_file_mmap,
 108        .llseek = noop_llseek,
 109};
 110
 111static struct vm_operations_struct vm_ops = {
 112        .fault = vma_fault,
 113};
 114
 115/*
 116 * Types of memories mapped into user processes' space
 117 */
 118enum mmap_types {
 119        PIO_BUFS = 1,
 120        PIO_BUFS_SOP,
 121        PIO_CRED,
 122        RCV_HDRQ,
 123        RCV_EGRBUF,
 124        UREGS,
 125        EVENTS,
 126        STATUS,
 127        RTAIL,
 128        SUBCTXT_UREGS,
 129        SUBCTXT_RCV_HDRQ,
 130        SUBCTXT_EGRBUF,
 131        SDMA_COMP
 132};
 133
 134/*
 135 * Masks and offsets defining the mmap tokens
 136 */
 137#define HFI1_MMAP_OFFSET_MASK   0xfffULL
 138#define HFI1_MMAP_OFFSET_SHIFT  0
 139#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
 140#define HFI1_MMAP_SUBCTXT_SHIFT 12
 141#define HFI1_MMAP_CTXT_MASK     0xffULL
 142#define HFI1_MMAP_CTXT_SHIFT    16
 143#define HFI1_MMAP_TYPE_MASK     0xfULL
 144#define HFI1_MMAP_TYPE_SHIFT    24
 145#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
 146#define HFI1_MMAP_MAGIC_SHIFT   32
 147
 148#define HFI1_MMAP_MAGIC         0xdabbad00
 149
 150#define HFI1_MMAP_TOKEN_SET(field, val) \
 151        (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
 152#define HFI1_MMAP_TOKEN_GET(field, token) \
 153        (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
 154#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
 155        (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
 156        HFI1_MMAP_TOKEN_SET(TYPE, type) | \
 157        HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
 158        HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
 159        HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
 160
 161#define dbg(fmt, ...)                           \
 162        pr_info(fmt, ##__VA_ARGS__)
 163
 164static inline int is_valid_mmap(u64 token)
 165{
 166        return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
 167}
 168
 169static int hfi1_file_open(struct inode *inode, struct file *fp)
 170{
 171        struct hfi1_filedata *fd;
 172        struct hfi1_devdata *dd = container_of(inode->i_cdev,
 173                                               struct hfi1_devdata,
 174                                               user_cdev);
 175
 176        if (!atomic_inc_not_zero(&dd->user_refcount))
 177                return -ENXIO;
 178
 179        /* Just take a ref now. Not all opens result in a context assign */
 180        kobject_get(&dd->kobj);
 181
 182        /* The real work is performed later in assign_ctxt() */
 183
 184        fd = kzalloc(sizeof(*fd), GFP_KERNEL);
 185
 186        if (fd) {
 187                fd->rec_cpu_num = -1; /* no cpu affinity by default */
 188                fd->mm = current->mm;
 189                mmgrab(fd->mm);
 190                fp->private_data = fd;
 191        } else {
 192                fp->private_data = NULL;
 193
 194                if (atomic_dec_and_test(&dd->user_refcount))
 195                        complete(&dd->user_comp);
 196
 197                return -ENOMEM;
 198        }
 199
 200        return 0;
 201}
 202
 203static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 204                            unsigned long arg)
 205{
 206        struct hfi1_filedata *fd = fp->private_data;
 207        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 208        struct hfi1_user_info uinfo;
 209        struct hfi1_tid_info tinfo;
 210        int ret = 0;
 211        unsigned long addr;
 212        int uval = 0;
 213        unsigned long ul_uval = 0;
 214        u16 uval16 = 0;
 215
 216        hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
 217        if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
 218            cmd != HFI1_IOCTL_GET_VERS &&
 219            !uctxt)
 220                return -EINVAL;
 221
 222        switch (cmd) {
 223        case HFI1_IOCTL_ASSIGN_CTXT:
 224                if (uctxt)
 225                        return -EINVAL;
 226
 227                if (copy_from_user(&uinfo,
 228                                   (struct hfi1_user_info __user *)arg,
 229                                   sizeof(uinfo)))
 230                        return -EFAULT;
 231
 232                ret = assign_ctxt(fp, &uinfo);
 233                if (ret < 0)
 234                        return ret;
 235                ret = setup_ctxt(fp);
 236                if (ret)
 237                        return ret;
 238                ret = user_init(fp);
 239                break;
 240        case HFI1_IOCTL_CTXT_INFO:
 241                ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg,
 242                                    sizeof(struct hfi1_ctxt_info));
 243                break;
 244        case HFI1_IOCTL_USER_INFO:
 245                ret = get_base_info(fp, (void __user *)(unsigned long)arg,
 246                                    sizeof(struct hfi1_base_info));
 247                break;
 248        case HFI1_IOCTL_CREDIT_UPD:
 249                if (uctxt)
 250                        sc_return_credits(uctxt->sc);
 251                break;
 252
 253        case HFI1_IOCTL_TID_UPDATE:
 254                if (copy_from_user(&tinfo,
 255                                   (struct hfi11_tid_info __user *)arg,
 256                                   sizeof(tinfo)))
 257                        return -EFAULT;
 258
 259                ret = hfi1_user_exp_rcv_setup(fp, &tinfo);
 260                if (!ret) {
 261                        /*
 262                         * Copy the number of tidlist entries we used
 263                         * and the length of the buffer we registered.
 264                         * These fields are adjacent in the structure so
 265                         * we can copy them at the same time.
 266                         */
 267                        addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 268                        if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
 269                                         sizeof(tinfo.tidcnt) +
 270                                         sizeof(tinfo.length)))
 271                                ret = -EFAULT;
 272                }
 273                break;
 274
 275        case HFI1_IOCTL_TID_FREE:
 276                if (copy_from_user(&tinfo,
 277                                   (struct hfi11_tid_info __user *)arg,
 278                                   sizeof(tinfo)))
 279                        return -EFAULT;
 280
 281                ret = hfi1_user_exp_rcv_clear(fp, &tinfo);
 282                if (ret)
 283                        break;
 284                addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 285                if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
 286                                 sizeof(tinfo.tidcnt)))
 287                        ret = -EFAULT;
 288                break;
 289
 290        case HFI1_IOCTL_TID_INVAL_READ:
 291                if (copy_from_user(&tinfo,
 292                                   (struct hfi11_tid_info __user *)arg,
 293                                   sizeof(tinfo)))
 294                        return -EFAULT;
 295
 296                ret = hfi1_user_exp_rcv_invalid(fp, &tinfo);
 297                if (ret)
 298                        break;
 299                addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
 300                if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
 301                                 sizeof(tinfo.tidcnt)))
 302                        ret = -EFAULT;
 303                break;
 304
 305        case HFI1_IOCTL_RECV_CTRL:
 306                ret = get_user(uval, (int __user *)arg);
 307                if (ret != 0)
 308                        return -EFAULT;
 309                ret = manage_rcvq(uctxt, fd->subctxt, uval);
 310                break;
 311
 312        case HFI1_IOCTL_POLL_TYPE:
 313                ret = get_user(uval, (int __user *)arg);
 314                if (ret != 0)
 315                        return -EFAULT;
 316                uctxt->poll_type = (typeof(uctxt->poll_type))uval;
 317                break;
 318
 319        case HFI1_IOCTL_ACK_EVENT:
 320                ret = get_user(ul_uval, (unsigned long __user *)arg);
 321                if (ret != 0)
 322                        return -EFAULT;
 323                ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
 324                break;
 325
 326        case HFI1_IOCTL_SET_PKEY:
 327                ret = get_user(uval16, (u16 __user *)arg);
 328                if (ret != 0)
 329                        return -EFAULT;
 330                if (HFI1_CAP_IS_USET(PKEY_CHECK))
 331                        ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
 332                else
 333                        return -EPERM;
 334                break;
 335
 336        case HFI1_IOCTL_CTXT_RESET: {
 337                struct send_context *sc;
 338                struct hfi1_devdata *dd;
 339
 340                if (!uctxt || !uctxt->dd || !uctxt->sc)
 341                        return -EINVAL;
 342
 343                /*
 344                 * There is no protection here. User level has to
 345                 * guarantee that no one will be writing to the send
 346                 * context while it is being re-initialized.
 347                 * If user level breaks that guarantee, it will break
 348                 * it's own context and no one else's.
 349                 */
 350                dd = uctxt->dd;
 351                sc = uctxt->sc;
 352                /*
 353                 * Wait until the interrupt handler has marked the
 354                 * context as halted or frozen. Report error if we time
 355                 * out.
 356                 */
 357                wait_event_interruptible_timeout(
 358                        sc->halt_wait, (sc->flags & SCF_HALTED),
 359                        msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
 360                if (!(sc->flags & SCF_HALTED))
 361                        return -ENOLCK;
 362
 363                /*
 364                 * If the send context was halted due to a Freeze,
 365                 * wait until the device has been "unfrozen" before
 366                 * resetting the context.
 367                 */
 368                if (sc->flags & SCF_FROZEN) {
 369                        wait_event_interruptible_timeout(
 370                                dd->event_queue,
 371                                !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
 372                                msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
 373                        if (dd->flags & HFI1_FROZEN)
 374                                return -ENOLCK;
 375
 376                        if (dd->flags & HFI1_FORCED_FREEZE)
 377                                /*
 378                                 * Don't allow context reset if we are into
 379                                 * forced freeze
 380                                 */
 381                                return -ENODEV;
 382
 383                        sc_disable(sc);
 384                        ret = sc_enable(sc);
 385                        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
 386                                     uctxt->ctxt);
 387                } else {
 388                        ret = sc_restart(sc);
 389                }
 390                if (!ret)
 391                        sc_return_credits(sc);
 392                break;
 393        }
 394
 395        case HFI1_IOCTL_GET_VERS:
 396                uval = HFI1_USER_SWVERSION;
 397                if (put_user(uval, (int __user *)arg))
 398                        return -EFAULT;
 399                break;
 400
 401        default:
 402                return -EINVAL;
 403        }
 404
 405        return ret;
 406}
 407
 408static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 409{
 410        struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
 411        struct hfi1_user_sdma_pkt_q *pq = fd->pq;
 412        struct hfi1_user_sdma_comp_q *cq = fd->cq;
 413        int done = 0, reqs = 0;
 414        unsigned long dim = from->nr_segs;
 415
 416        if (!cq || !pq)
 417                return -EIO;
 418
 419        if (!iter_is_iovec(from) || !dim)
 420                return -EINVAL;
 421
 422        hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
 423                  fd->uctxt->ctxt, fd->subctxt, dim);
 424
 425        if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
 426                return -ENOSPC;
 427
 428        while (dim) {
 429                int ret;
 430                unsigned long count = 0;
 431
 432                ret = hfi1_user_sdma_process_request(
 433                        kiocb->ki_filp, (struct iovec *)(from->iov + done),
 434                        dim, &count);
 435                if (ret) {
 436                        reqs = ret;
 437                        break;
 438                }
 439                dim -= count;
 440                done += count;
 441                reqs++;
 442        }
 443
 444        return reqs;
 445}
 446
 447static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 448{
 449        struct hfi1_filedata *fd = fp->private_data;
 450        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 451        struct hfi1_devdata *dd;
 452        unsigned long flags;
 453        u64 token = vma->vm_pgoff << PAGE_SHIFT,
 454                memaddr = 0;
 455        void *memvirt = NULL;
 456        u8 subctxt, mapio = 0, vmf = 0, type;
 457        ssize_t memlen = 0;
 458        int ret = 0;
 459        u16 ctxt;
 460
 461        if (!is_valid_mmap(token) || !uctxt ||
 462            !(vma->vm_flags & VM_SHARED)) {
 463                ret = -EINVAL;
 464                goto done;
 465        }
 466        dd = uctxt->dd;
 467        ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
 468        subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
 469        type = HFI1_MMAP_TOKEN_GET(TYPE, token);
 470        if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
 471                ret = -EINVAL;
 472                goto done;
 473        }
 474
 475        flags = vma->vm_flags;
 476
 477        switch (type) {
 478        case PIO_BUFS:
 479        case PIO_BUFS_SOP:
 480                memaddr = ((dd->physaddr + TXE_PIO_SEND) +
 481                                /* chip pio base */
 482                           (uctxt->sc->hw_context * BIT(16))) +
 483                                /* 64K PIO space / ctxt */
 484                        (type == PIO_BUFS_SOP ?
 485                                (TXE_PIO_SIZE / 2) : 0); /* sop? */
 486                /*
 487                 * Map only the amount allocated to the context, not the
 488                 * entire available context's PIO space.
 489                 */
 490                memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
 491                flags &= ~VM_MAYREAD;
 492                flags |= VM_DONTCOPY | VM_DONTEXPAND;
 493                vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 494                mapio = 1;
 495                break;
 496        case PIO_CRED:
 497                if (flags & VM_WRITE) {
 498                        ret = -EPERM;
 499                        goto done;
 500                }
 501                /*
 502                 * The credit return location for this context could be on the
 503                 * second or third page allocated for credit returns (if number
 504                 * of enabled contexts > 64 and 128 respectively).
 505                 */
 506                memvirt = dd->cr_base[uctxt->numa_id].va;
 507                memaddr = virt_to_phys(memvirt) +
 508                        (((u64)uctxt->sc->hw_free -
 509                          (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
 510                memlen = PAGE_SIZE;
 511                flags &= ~VM_MAYWRITE;
 512                flags |= VM_DONTCOPY | VM_DONTEXPAND;
 513                /*
 514                 * The driver has already allocated memory for credit
 515                 * returns and programmed it into the chip. Has that
 516                 * memory been flagged as non-cached?
 517                 */
 518                /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
 519                mapio = 1;
 520                break;
 521        case RCV_HDRQ:
 522                memlen = uctxt->rcvhdrq_size;
 523                memvirt = uctxt->rcvhdrq;
 524                break;
 525        case RCV_EGRBUF: {
 526                unsigned long addr;
 527                int i;
 528                /*
 529                 * The RcvEgr buffer need to be handled differently
 530                 * as multiple non-contiguous pages need to be mapped
 531                 * into the user process.
 532                 */
 533                memlen = uctxt->egrbufs.size;
 534                if ((vma->vm_end - vma->vm_start) != memlen) {
 535                        dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
 536                                   (vma->vm_end - vma->vm_start), memlen);
 537                        ret = -EINVAL;
 538                        goto done;
 539                }
 540                if (vma->vm_flags & VM_WRITE) {
 541                        ret = -EPERM;
 542                        goto done;
 543                }
 544                vma->vm_flags &= ~VM_MAYWRITE;
 545                addr = vma->vm_start;
 546                for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
 547                        memlen = uctxt->egrbufs.buffers[i].len;
 548                        memvirt = uctxt->egrbufs.buffers[i].addr;
 549                        ret = remap_pfn_range(
 550                                vma, addr,
 551                                /*
 552                                 * virt_to_pfn() does the same, but
 553                                 * it's not available on x86_64
 554                                 * when CONFIG_MMU is enabled.
 555                                 */
 556                                PFN_DOWN(__pa(memvirt)),
 557                                memlen,
 558                                vma->vm_page_prot);
 559                        if (ret < 0)
 560                                goto done;
 561                        addr += memlen;
 562                }
 563                ret = 0;
 564                goto done;
 565        }
 566        case UREGS:
 567                /*
 568                 * Map only the page that contains this context's user
 569                 * registers.
 570                 */
 571                memaddr = (unsigned long)
 572                        (dd->physaddr + RXE_PER_CONTEXT_USER)
 573                        + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
 574                /*
 575                 * TidFlow table is on the same page as the rest of the
 576                 * user registers.
 577                 */
 578                memlen = PAGE_SIZE;
 579                flags |= VM_DONTCOPY | VM_DONTEXPAND;
 580                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 581                mapio = 1;
 582                break;
 583        case EVENTS:
 584                /*
 585                 * Use the page where this context's flags are. User level
 586                 * knows where it's own bitmap is within the page.
 587                 */
 588                memaddr = (unsigned long)(dd->events +
 589                                          ((uctxt->ctxt - dd->first_user_ctxt) *
 590                                           HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
 591                memlen = PAGE_SIZE;
 592                /*
 593                 * v3.7 removes VM_RESERVED but the effect is kept by
 594                 * using VM_IO.
 595                 */
 596                flags |= VM_IO | VM_DONTEXPAND;
 597                vmf = 1;
 598                break;
 599        case STATUS:
 600                memaddr = kvirt_to_phys((void *)dd->status);
 601                memlen = PAGE_SIZE;
 602                flags |= VM_IO | VM_DONTEXPAND;
 603                break;
 604        case RTAIL:
 605                if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
 606                        /*
 607                         * If the memory allocation failed, the context alloc
 608                         * also would have failed, so we would never get here
 609                         */
 610                        ret = -EINVAL;
 611                        goto done;
 612                }
 613                if (flags & VM_WRITE) {
 614                        ret = -EPERM;
 615                        goto done;
 616                }
 617                memlen = PAGE_SIZE;
 618                memvirt = (void *)uctxt->rcvhdrtail_kvaddr;
 619                flags &= ~VM_MAYWRITE;
 620                break;
 621        case SUBCTXT_UREGS:
 622                memaddr = (u64)uctxt->subctxt_uregbase;
 623                memlen = PAGE_SIZE;
 624                flags |= VM_IO | VM_DONTEXPAND;
 625                vmf = 1;
 626                break;
 627        case SUBCTXT_RCV_HDRQ:
 628                memaddr = (u64)uctxt->subctxt_rcvhdr_base;
 629                memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
 630                flags |= VM_IO | VM_DONTEXPAND;
 631                vmf = 1;
 632                break;
 633        case SUBCTXT_EGRBUF:
 634                memaddr = (u64)uctxt->subctxt_rcvegrbuf;
 635                memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
 636                flags |= VM_IO | VM_DONTEXPAND;
 637                flags &= ~VM_MAYWRITE;
 638                vmf = 1;
 639                break;
 640        case SDMA_COMP: {
 641                struct hfi1_user_sdma_comp_q *cq = fd->cq;
 642
 643                if (!cq) {
 644                        ret = -EFAULT;
 645                        goto done;
 646                }
 647                memaddr = (u64)cq->comps;
 648                memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
 649                flags |= VM_IO | VM_DONTEXPAND;
 650                vmf = 1;
 651                break;
 652        }
 653        default:
 654                ret = -EINVAL;
 655                break;
 656        }
 657
 658        if ((vma->vm_end - vma->vm_start) != memlen) {
 659                hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
 660                          uctxt->ctxt, fd->subctxt,
 661                          (vma->vm_end - vma->vm_start), memlen);
 662                ret = -EINVAL;
 663                goto done;
 664        }
 665
 666        vma->vm_flags = flags;
 667        hfi1_cdbg(PROC,
 668                  "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
 669                    ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
 670                    vma->vm_end - vma->vm_start, vma->vm_flags);
 671        if (vmf) {
 672                vma->vm_pgoff = PFN_DOWN(memaddr);
 673                vma->vm_ops = &vm_ops;
 674                ret = 0;
 675        } else if (mapio) {
 676                ret = io_remap_pfn_range(vma, vma->vm_start,
 677                                         PFN_DOWN(memaddr),
 678                                         memlen,
 679                                         vma->vm_page_prot);
 680        } else if (memvirt) {
 681                ret = remap_pfn_range(vma, vma->vm_start,
 682                                      PFN_DOWN(__pa(memvirt)),
 683                                      memlen,
 684                                      vma->vm_page_prot);
 685        } else {
 686                ret = remap_pfn_range(vma, vma->vm_start,
 687                                      PFN_DOWN(memaddr),
 688                                      memlen,
 689                                      vma->vm_page_prot);
 690        }
 691done:
 692        return ret;
 693}
 694
 695/*
 696 * Local (non-chip) user memory is not mapped right away but as it is
 697 * accessed by the user-level code.
 698 */
 699static int vma_fault(struct vm_fault *vmf)
 700{
 701        struct page *page;
 702
 703        page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
 704        if (!page)
 705                return VM_FAULT_SIGBUS;
 706
 707        get_page(page);
 708        vmf->page = page;
 709
 710        return 0;
 711}
 712
 713static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
 714{
 715        struct hfi1_ctxtdata *uctxt;
 716        unsigned pollflag;
 717
 718        uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
 719        if (!uctxt)
 720                pollflag = POLLERR;
 721        else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
 722                pollflag = poll_urgent(fp, pt);
 723        else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
 724                pollflag = poll_next(fp, pt);
 725        else /* invalid */
 726                pollflag = POLLERR;
 727
 728        return pollflag;
 729}
 730
 731static int hfi1_file_close(struct inode *inode, struct file *fp)
 732{
 733        struct hfi1_filedata *fdata = fp->private_data;
 734        struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 735        struct hfi1_devdata *dd = container_of(inode->i_cdev,
 736                                               struct hfi1_devdata,
 737                                               user_cdev);
 738        unsigned long flags, *ev;
 739
 740        fp->private_data = NULL;
 741
 742        if (!uctxt)
 743                goto done;
 744
 745        hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
 746        mutex_lock(&hfi1_mutex);
 747
 748        flush_wc();
 749        /* drain user sdma queue */
 750        hfi1_user_sdma_free_queues(fdata);
 751
 752        /* release the cpu */
 753        hfi1_put_proc_affinity(fdata->rec_cpu_num);
 754
 755        /*
 756         * Clear any left over, unhandled events so the next process that
 757         * gets this context doesn't get confused.
 758         */
 759        ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
 760                           HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
 761        *ev = 0;
 762
 763        if (--uctxt->cnt) {
 764                uctxt->active_slaves &= ~(1 << fdata->subctxt);
 765                mutex_unlock(&hfi1_mutex);
 766                goto done;
 767        }
 768
 769        spin_lock_irqsave(&dd->uctxt_lock, flags);
 770        /*
 771         * Disable receive context and interrupt available, reset all
 772         * RcvCtxtCtrl bits to default values.
 773         */
 774        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
 775                     HFI1_RCVCTRL_TIDFLOW_DIS |
 776                     HFI1_RCVCTRL_INTRAVAIL_DIS |
 777                     HFI1_RCVCTRL_TAILUPD_DIS |
 778                     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 779                     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
 780                     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
 781        /* Clear the context's J_KEY */
 782        hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
 783        /*
 784         * Reset context integrity checks to default.
 785         * (writes to CSRs probably belong in chip.c)
 786         */
 787        write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
 788                        hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
 789        sc_disable(uctxt->sc);
 790        spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 791
 792        dd->rcd[uctxt->ctxt] = NULL;
 793
 794        hfi1_user_exp_rcv_free(fdata);
 795        hfi1_clear_ctxt_pkey(dd, uctxt->ctxt);
 796
 797        uctxt->rcvwait_to = 0;
 798        uctxt->piowait_to = 0;
 799        uctxt->rcvnowait = 0;
 800        uctxt->pionowait = 0;
 801        uctxt->event_flags = 0;
 802
 803        hfi1_stats.sps_ctxts--;
 804        if (++dd->freectxts == dd->num_user_contexts)
 805                aspm_enable_all(dd);
 806        mutex_unlock(&hfi1_mutex);
 807        hfi1_free_ctxtdata(dd, uctxt);
 808done:
 809        mmdrop(fdata->mm);
 810        kobject_put(&dd->kobj);
 811
 812        if (atomic_dec_and_test(&dd->user_refcount))
 813                complete(&dd->user_comp);
 814
 815        kfree(fdata);
 816        return 0;
 817}
 818
 819/*
 820 * Convert kernel *virtual* addresses to physical addresses.
 821 * This is used to vmalloc'ed addresses.
 822 */
 823static u64 kvirt_to_phys(void *addr)
 824{
 825        struct page *page;
 826        u64 paddr = 0;
 827
 828        page = vmalloc_to_page(addr);
 829        if (page)
 830                paddr = page_to_pfn(page) << PAGE_SHIFT;
 831
 832        return paddr;
 833}
 834
 835static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo)
 836{
 837        int i_minor, ret = 0;
 838        unsigned int swmajor, swminor;
 839
 840        swmajor = uinfo->userversion >> 16;
 841        if (swmajor != HFI1_USER_SWMAJOR) {
 842                ret = -ENODEV;
 843                goto done;
 844        }
 845
 846        swminor = uinfo->userversion & 0xffff;
 847
 848        mutex_lock(&hfi1_mutex);
 849        /* First, lets check if we need to setup a shared context? */
 850        if (uinfo->subctxt_cnt) {
 851                struct hfi1_filedata *fd = fp->private_data;
 852
 853                ret = find_shared_ctxt(fp, uinfo);
 854                if (ret < 0)
 855                        goto done_unlock;
 856                if (ret) {
 857                        fd->rec_cpu_num =
 858                                hfi1_get_proc_affinity(fd->uctxt->numa_id);
 859                }
 860        }
 861
 862        /*
 863         * We execute the following block if we couldn't find a
 864         * shared context or if context sharing is not required.
 865         */
 866        if (!ret) {
 867                i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE;
 868                ret = get_user_context(fp, uinfo, i_minor);
 869        }
 870done_unlock:
 871        mutex_unlock(&hfi1_mutex);
 872done:
 873        return ret;
 874}
 875
 876static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo,
 877                            int devno)
 878{
 879        struct hfi1_devdata *dd = NULL;
 880        int devmax, npresent, nup;
 881
 882        devmax = hfi1_count_units(&npresent, &nup);
 883        if (!npresent)
 884                return -ENXIO;
 885
 886        if (!nup)
 887                return -ENETDOWN;
 888
 889        dd = hfi1_lookup(devno);
 890        if (!dd)
 891                return -ENODEV;
 892        else if (!dd->freectxts)
 893                return -EBUSY;
 894
 895        return allocate_ctxt(fp, dd, uinfo);
 896}
 897
 898static int find_shared_ctxt(struct file *fp,
 899                            const struct hfi1_user_info *uinfo)
 900{
 901        int devmax, ndev, i;
 902        int ret = 0;
 903        struct hfi1_filedata *fd = fp->private_data;
 904
 905        devmax = hfi1_count_units(NULL, NULL);
 906
 907        for (ndev = 0; ndev < devmax; ndev++) {
 908                struct hfi1_devdata *dd = hfi1_lookup(ndev);
 909
 910                if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase))
 911                        continue;
 912                for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) {
 913                        struct hfi1_ctxtdata *uctxt = dd->rcd[i];
 914
 915                        /* Skip ctxts which are not yet open */
 916                        if (!uctxt || !uctxt->cnt)
 917                                continue;
 918                        /* Skip ctxt if it doesn't match the requested one */
 919                        if (memcmp(uctxt->uuid, uinfo->uuid,
 920                                   sizeof(uctxt->uuid)) ||
 921                            uctxt->jkey != generate_jkey(current_uid()) ||
 922                            uctxt->subctxt_id != uinfo->subctxt_id ||
 923                            uctxt->subctxt_cnt != uinfo->subctxt_cnt)
 924                                continue;
 925
 926                        /* Verify the sharing process matches the master */
 927                        if (uctxt->userversion != uinfo->userversion ||
 928                            uctxt->cnt >= uctxt->subctxt_cnt) {
 929                                ret = -EINVAL;
 930                                goto done;
 931                        }
 932                        fd->uctxt = uctxt;
 933                        fd->subctxt  = uctxt->cnt++;
 934                        uctxt->active_slaves |= 1 << fd->subctxt;
 935                        ret = 1;
 936                        goto done;
 937                }
 938        }
 939
 940done:
 941        return ret;
 942}
 943
 944static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd,
 945                         struct hfi1_user_info *uinfo)
 946{
 947        struct hfi1_filedata *fd = fp->private_data;
 948        struct hfi1_ctxtdata *uctxt;
 949        unsigned ctxt;
 950        int ret, numa;
 951
 952        if (dd->flags & HFI1_FROZEN) {
 953                /*
 954                 * Pick an error that is unique from all other errors
 955                 * that are returned so the user process knows that
 956                 * it tried to allocate while the SPC was frozen.  It
 957                 * it should be able to retry with success in a short
 958                 * while.
 959                 */
 960                return -EIO;
 961        }
 962
 963        for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++)
 964                if (!dd->rcd[ctxt])
 965                        break;
 966
 967        if (ctxt == dd->num_rcv_contexts)
 968                return -EBUSY;
 969
 970        /*
 971         * If we don't have a NUMA node requested, preference is towards
 972         * device NUMA node.
 973         */
 974        fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
 975        if (fd->rec_cpu_num != -1)
 976                numa = cpu_to_node(fd->rec_cpu_num);
 977        else
 978                numa = numa_node_id();
 979        uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
 980        if (!uctxt) {
 981                dd_dev_err(dd,
 982                           "Unable to allocate ctxtdata memory, failing open\n");
 983                return -ENOMEM;
 984        }
 985        hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
 986                  uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
 987                  uctxt->numa_id);
 988
 989        /*
 990         * Allocate and enable a PIO send context.
 991         */
 992        uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
 993                             uctxt->dd->node);
 994        if (!uctxt->sc) {
 995                ret = -ENOMEM;
 996                goto ctxdata_free;
 997        }
 998        hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
 999                  uctxt->sc->hw_context);
1000        ret = sc_enable(uctxt->sc);
1001        if (ret)
1002                goto ctxdata_free;
1003
1004        /*
1005         * Setup shared context resources if the user-level has requested
1006         * shared contexts and this is the 'master' process.
1007         * This has to be done here so the rest of the sub-contexts find the
1008         * proper master.
1009         */
1010        if (uinfo->subctxt_cnt && !fd->subctxt) {
1011                ret = init_subctxts(uctxt, uinfo);
1012                /*
1013                 * On error, we don't need to disable and de-allocate the
1014                 * send context because it will be done during file close
1015                 */
1016                if (ret)
1017                        goto ctxdata_free;
1018        }
1019        uctxt->userversion = uinfo->userversion;
1020        uctxt->flags = hfi1_cap_mask; /* save current flag state */
1021        init_waitqueue_head(&uctxt->wait);
1022        strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
1023        memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
1024        uctxt->jkey = generate_jkey(current_uid());
1025        INIT_LIST_HEAD(&uctxt->sdma_queues);
1026        spin_lock_init(&uctxt->sdma_qlock);
1027        hfi1_stats.sps_ctxts++;
1028        /*
1029         * Disable ASPM when there are open user/PSM contexts to avoid
1030         * issues with ASPM L1 exit latency
1031         */
1032        if (dd->freectxts-- == dd->num_user_contexts)
1033                aspm_disable_all(dd);
1034        fd->uctxt = uctxt;
1035
1036        return 0;
1037
1038ctxdata_free:
1039        dd->rcd[ctxt] = NULL;
1040        hfi1_free_ctxtdata(dd, uctxt);
1041        return ret;
1042}
1043
1044static int init_subctxts(struct hfi1_ctxtdata *uctxt,
1045                         const struct hfi1_user_info *uinfo)
1046{
1047        unsigned num_subctxts;
1048
1049        num_subctxts = uinfo->subctxt_cnt;
1050        if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
1051                return -EINVAL;
1052
1053        uctxt->subctxt_cnt = uinfo->subctxt_cnt;
1054        uctxt->subctxt_id = uinfo->subctxt_id;
1055        uctxt->active_slaves = 1;
1056        uctxt->redirect_seq_cnt = 1;
1057        set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
1058
1059        return 0;
1060}
1061
1062static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
1063{
1064        int ret = 0;
1065        unsigned num_subctxts = uctxt->subctxt_cnt;
1066
1067        uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
1068        if (!uctxt->subctxt_uregbase) {
1069                ret = -ENOMEM;
1070                goto bail;
1071        }
1072        /* We can take the size of the RcvHdr Queue from the master */
1073        uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
1074                                                  num_subctxts);
1075        if (!uctxt->subctxt_rcvhdr_base) {
1076                ret = -ENOMEM;
1077                goto bail_ureg;
1078        }
1079
1080        uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
1081                                                num_subctxts);
1082        if (!uctxt->subctxt_rcvegrbuf) {
1083                ret = -ENOMEM;
1084                goto bail_rhdr;
1085        }
1086        goto bail;
1087bail_rhdr:
1088        vfree(uctxt->subctxt_rcvhdr_base);
1089bail_ureg:
1090        vfree(uctxt->subctxt_uregbase);
1091        uctxt->subctxt_uregbase = NULL;
1092bail:
1093        return ret;
1094}
1095
1096static int user_init(struct file *fp)
1097{
1098        unsigned int rcvctrl_ops = 0;
1099        struct hfi1_filedata *fd = fp->private_data;
1100        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1101
1102        /* make sure that the context has already been setup */
1103        if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
1104                return -EFAULT;
1105
1106        /* initialize poll variables... */
1107        uctxt->urgent = 0;
1108        uctxt->urgent_poll = 0;
1109
1110        /*
1111         * Now enable the ctxt for receive.
1112         * For chips that are set to DMA the tail register to memory
1113         * when they change (and when the update bit transitions from
1114         * 0 to 1.  So for those chips, we turn it off and then back on.
1115         * This will (very briefly) affect any other open ctxts, but the
1116         * duration is very short, and therefore isn't an issue.  We
1117         * explicitly set the in-memory tail copy to 0 beforehand, so we
1118         * don't have to wait to be sure the DMA update has happened
1119         * (chip resets head/tail to 0 on transition to enable).
1120         */
1121        if (uctxt->rcvhdrtail_kvaddr)
1122                clear_rcvhdrtail(uctxt);
1123
1124        /* Setup J_KEY before enabling the context */
1125        hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
1126
1127        rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
1128        if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
1129                rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
1130        /*
1131         * Ignore the bit in the flags for now until proper
1132         * support for multiple packet per rcv array entry is
1133         * added.
1134         */
1135        if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
1136                rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
1137        if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
1138                rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
1139        if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
1140                rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
1141        /*
1142         * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
1143         * We can't rely on the correct value to be set from prior
1144         * uses of the chip or ctxt. Therefore, add the rcvctrl op
1145         * for both cases.
1146         */
1147        if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
1148                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
1149        else
1150                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
1151        hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
1152
1153        /* Notify any waiting slaves */
1154        if (uctxt->subctxt_cnt) {
1155                clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags);
1156                wake_up(&uctxt->wait);
1157        }
1158
1159        return 0;
1160}
1161
1162static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len)
1163{
1164        struct hfi1_ctxt_info cinfo;
1165        struct hfi1_filedata *fd = fp->private_data;
1166        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1167        int ret = 0;
1168
1169        memset(&cinfo, 0, sizeof(cinfo));
1170        cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
1171                                HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
1172                        HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
1173                        HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
1174        /* adjust flag if this fd is not able to cache */
1175        if (!fd->handler)
1176                cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
1177
1178        cinfo.num_active = hfi1_count_active_units();
1179        cinfo.unit = uctxt->dd->unit;
1180        cinfo.ctxt = uctxt->ctxt;
1181        cinfo.subctxt = fd->subctxt;
1182        cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
1183                                uctxt->dd->rcv_entries.group_size) +
1184                uctxt->expected_count;
1185        cinfo.credits = uctxt->sc->credits;
1186        cinfo.numa_node = uctxt->numa_id;
1187        cinfo.rec_cpu = fd->rec_cpu_num;
1188        cinfo.send_ctxt = uctxt->sc->hw_context;
1189
1190        cinfo.egrtids = uctxt->egrbufs.alloced;
1191        cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
1192        cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
1193        cinfo.sdma_ring_size = fd->cq->nentries;
1194        cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
1195
1196        trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
1197        if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
1198                ret = -EFAULT;
1199
1200        return ret;
1201}
1202
1203static int setup_ctxt(struct file *fp)
1204{
1205        struct hfi1_filedata *fd = fp->private_data;
1206        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1207        struct hfi1_devdata *dd = uctxt->dd;
1208        int ret = 0;
1209
1210        /*
1211         * Context should be set up only once, including allocation and
1212         * programming of eager buffers. This is done if context sharing
1213         * is not requested or by the master process.
1214         */
1215        if (!uctxt->subctxt_cnt || !fd->subctxt) {
1216                ret = hfi1_init_ctxt(uctxt->sc);
1217                if (ret)
1218                        goto done;
1219
1220                /* Now allocate the RcvHdr queue and eager buffers. */
1221                ret = hfi1_create_rcvhdrq(dd, uctxt);
1222                if (ret)
1223                        goto done;
1224                ret = hfi1_setup_eagerbufs(uctxt);
1225                if (ret)
1226                        goto done;
1227                if (uctxt->subctxt_cnt && !fd->subctxt) {
1228                        ret = setup_subctxt(uctxt);
1229                        if (ret)
1230                                goto done;
1231                }
1232        } else {
1233                ret = wait_event_interruptible(uctxt->wait, !test_bit(
1234                                               HFI1_CTXT_MASTER_UNINIT,
1235                                               &uctxt->event_flags));
1236                if (ret)
1237                        goto done;
1238        }
1239
1240        ret = hfi1_user_sdma_alloc_queues(uctxt, fp);
1241        if (ret)
1242                goto done;
1243        /*
1244         * Expected receive has to be setup for all processes (including
1245         * shared contexts). However, it has to be done after the master
1246         * context has been fully configured as it depends on the
1247         * eager/expected split of the RcvArray entries.
1248         * Setting it up here ensures that the subcontexts will be waiting
1249         * (due to the above wait_event_interruptible() until the master
1250         * is setup.
1251         */
1252        ret = hfi1_user_exp_rcv_init(fp);
1253        if (ret)
1254                goto done;
1255
1256        set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags);
1257done:
1258        return ret;
1259}
1260
1261static int get_base_info(struct file *fp, void __user *ubase, __u32 len)
1262{
1263        struct hfi1_base_info binfo;
1264        struct hfi1_filedata *fd = fp->private_data;
1265        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1266        struct hfi1_devdata *dd = uctxt->dd;
1267        ssize_t sz;
1268        unsigned offset;
1269        int ret = 0;
1270
1271        trace_hfi1_uctxtdata(uctxt->dd, uctxt);
1272
1273        memset(&binfo, 0, sizeof(binfo));
1274        binfo.hw_version = dd->revision;
1275        binfo.sw_version = HFI1_KERN_SWVERSION;
1276        binfo.bthqp = kdeth_qp;
1277        binfo.jkey = uctxt->jkey;
1278        /*
1279         * If more than 64 contexts are enabled the allocated credit
1280         * return will span two or three contiguous pages. Since we only
1281         * map the page containing the context's credit return address,
1282         * we need to calculate the offset in the proper page.
1283         */
1284        offset = ((u64)uctxt->sc->hw_free -
1285                  (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
1286        binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
1287                                                fd->subctxt, offset);
1288        binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
1289                                            fd->subctxt,
1290                                            uctxt->sc->base_addr);
1291        binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
1292                                                uctxt->ctxt,
1293                                                fd->subctxt,
1294                                                uctxt->sc->base_addr);
1295        binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
1296                                               fd->subctxt,
1297                                               uctxt->rcvhdrq);
1298        binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
1299                                               fd->subctxt,
1300                                               uctxt->egrbufs.rcvtids[0].dma);
1301        binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
1302                                                 fd->subctxt, 0);
1303        /*
1304         * user regs are at
1305         * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
1306         */
1307        binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
1308                                            fd->subctxt, 0);
1309        offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) *
1310                    HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
1311                  sizeof(*dd->events));
1312        binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
1313                                              fd->subctxt,
1314                                              offset);
1315        binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
1316                                              fd->subctxt,
1317                                              dd->status);
1318        if (HFI1_CAP_IS_USET(DMA_RTAIL))
1319                binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
1320                                                       fd->subctxt, 0);
1321        if (uctxt->subctxt_cnt) {
1322                binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
1323                                                        uctxt->ctxt,
1324                                                        fd->subctxt, 0);
1325                binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
1326                                                         uctxt->ctxt,
1327                                                         fd->subctxt, 0);
1328                binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
1329                                                         uctxt->ctxt,
1330                                                         fd->subctxt, 0);
1331        }
1332        sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
1333        if (copy_to_user(ubase, &binfo, sz))
1334                ret = -EFAULT;
1335        return ret;
1336}
1337
1338static unsigned int poll_urgent(struct file *fp,
1339                                struct poll_table_struct *pt)
1340{
1341        struct hfi1_filedata *fd = fp->private_data;
1342        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1343        struct hfi1_devdata *dd = uctxt->dd;
1344        unsigned pollflag;
1345
1346        poll_wait(fp, &uctxt->wait, pt);
1347
1348        spin_lock_irq(&dd->uctxt_lock);
1349        if (uctxt->urgent != uctxt->urgent_poll) {
1350                pollflag = POLLIN | POLLRDNORM;
1351                uctxt->urgent_poll = uctxt->urgent;
1352        } else {
1353                pollflag = 0;
1354                set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
1355        }
1356        spin_unlock_irq(&dd->uctxt_lock);
1357
1358        return pollflag;
1359}
1360
1361static unsigned int poll_next(struct file *fp,
1362                              struct poll_table_struct *pt)
1363{
1364        struct hfi1_filedata *fd = fp->private_data;
1365        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1366        struct hfi1_devdata *dd = uctxt->dd;
1367        unsigned pollflag;
1368
1369        poll_wait(fp, &uctxt->wait, pt);
1370
1371        spin_lock_irq(&dd->uctxt_lock);
1372        if (hdrqempty(uctxt)) {
1373                set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
1374                hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
1375                pollflag = 0;
1376        } else {
1377                pollflag = POLLIN | POLLRDNORM;
1378        }
1379        spin_unlock_irq(&dd->uctxt_lock);
1380
1381        return pollflag;
1382}
1383
1384/*
1385 * Find all user contexts in use, and set the specified bit in their
1386 * event mask.
1387 * See also find_ctxt() for a similar use, that is specific to send buffers.
1388 */
1389int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
1390{
1391        struct hfi1_ctxtdata *uctxt;
1392        struct hfi1_devdata *dd = ppd->dd;
1393        unsigned ctxt;
1394        int ret = 0;
1395        unsigned long flags;
1396
1397        if (!dd->events) {
1398                ret = -EINVAL;
1399                goto done;
1400        }
1401
1402        spin_lock_irqsave(&dd->uctxt_lock, flags);
1403        for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts;
1404             ctxt++) {
1405                uctxt = dd->rcd[ctxt];
1406                if (uctxt) {
1407                        unsigned long *evs = dd->events +
1408                                (uctxt->ctxt - dd->first_user_ctxt) *
1409                                HFI1_MAX_SHARED_CTXTS;
1410                        int i;
1411                        /*
1412                         * subctxt_cnt is 0 if not shared, so do base
1413                         * separately, first, then remaining subctxt, if any
1414                         */
1415                        set_bit(evtbit, evs);
1416                        for (i = 1; i < uctxt->subctxt_cnt; i++)
1417                                set_bit(evtbit, evs + i);
1418                }
1419        }
1420        spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1421done:
1422        return ret;
1423}
1424
1425/**
1426 * manage_rcvq - manage a context's receive queue
1427 * @uctxt: the context
1428 * @subctxt: the sub-context
1429 * @start_stop: action to carry out
1430 *
1431 * start_stop == 0 disables receive on the context, for use in queue
1432 * overflow conditions.  start_stop==1 re-enables, to be used to
1433 * re-init the software copy of the head register
1434 */
1435static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
1436                       int start_stop)
1437{
1438        struct hfi1_devdata *dd = uctxt->dd;
1439        unsigned int rcvctrl_op;
1440
1441        if (subctxt)
1442                goto bail;
1443        /* atomically clear receive enable ctxt. */
1444        if (start_stop) {
1445                /*
1446                 * On enable, force in-memory copy of the tail register to
1447                 * 0, so that protocol code doesn't have to worry about
1448                 * whether or not the chip has yet updated the in-memory
1449                 * copy or not on return from the system call. The chip
1450                 * always resets it's tail register back to 0 on a
1451                 * transition from disabled to enabled.
1452                 */
1453                if (uctxt->rcvhdrtail_kvaddr)
1454                        clear_rcvhdrtail(uctxt);
1455                rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
1456        } else {
1457                rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
1458        }
1459        hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
1460        /* always; new head should be equal to new tail; see above */
1461bail:
1462        return 0;
1463}
1464
1465/*
1466 * clear the event notifier events for this context.
1467 * User process then performs actions appropriate to bit having been
1468 * set, if desired, and checks again in future.
1469 */
1470static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt,
1471                          unsigned long events)
1472{
1473        int i;
1474        struct hfi1_devdata *dd = uctxt->dd;
1475        unsigned long *evs;
1476
1477        if (!dd->events)
1478                return 0;
1479
1480        evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) *
1481                            HFI1_MAX_SHARED_CTXTS) + subctxt;
1482
1483        for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
1484                if (!test_bit(i, &events))
1485                        continue;
1486                clear_bit(i, evs);
1487        }
1488        return 0;
1489}
1490
1491static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt,
1492                         u16 pkey)
1493{
1494        int ret = -ENOENT, i, intable = 0;
1495        struct hfi1_pportdata *ppd = uctxt->ppd;
1496        struct hfi1_devdata *dd = uctxt->dd;
1497
1498        if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
1499                ret = -EINVAL;
1500                goto done;
1501        }
1502
1503        for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
1504                if (pkey == ppd->pkeys[i]) {
1505                        intable = 1;
1506                        break;
1507                }
1508
1509        if (intable)
1510                ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
1511done:
1512        return ret;
1513}
1514
1515static void user_remove(struct hfi1_devdata *dd)
1516{
1517
1518        hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
1519}
1520
1521static int user_add(struct hfi1_devdata *dd)
1522{
1523        char name[10];
1524        int ret;
1525
1526        snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
1527        ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
1528                             &dd->user_cdev, &dd->user_device,
1529                             true, &dd->kobj);
1530        if (ret)
1531                user_remove(dd);
1532
1533        return ret;
1534}
1535
1536/*
1537 * Create per-unit files in /dev
1538 */
1539int hfi1_device_create(struct hfi1_devdata *dd)
1540{
1541        return user_add(dd);
1542}
1543
1544/*
1545 * Remove per-unit files in /dev
1546 * void, core kernel returns no errors for this stuff
1547 */
1548void hfi1_device_remove(struct hfi1_devdata *dd)
1549{
1550        user_remove(dd);
1551}
1552