linux/drivers/infiniband/hw/hfi1/file_ops.c
   1/*
   2 * Copyright(c) 2015-2017 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47#include <linux/poll.h>
  48#include <linux/cdev.h>
  49#include <linux/vmalloc.h>
  50#include <linux/io.h>
  51#include <linux/sched/mm.h>
  52#include <linux/bitmap.h>
  53
  54#include <rdma/ib.h>
  55
  56#include "hfi.h"
  57#include "pio.h"
  58#include "device.h"
  59#include "common.h"
  60#include "trace.h"
  61#include "mmu_rb.h"
  62#include "user_sdma.h"
  63#include "user_exp_rcv.h"
  64#include "aspm.h"
  65
  66#undef pr_fmt
  67#define pr_fmt(fmt) DRIVER_NAME ": " fmt
  68
  69#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
  70
  71/*
  72 * File operation functions
  73 */
  74static int hfi1_file_open(struct inode *inode, struct file *fp);
  75static int hfi1_file_close(struct inode *inode, struct file *fp);
  76static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from);
  77static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt);
  78static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma);
  79
  80static u64 kvirt_to_phys(void *addr);
  81static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len);
  82static void init_subctxts(struct hfi1_ctxtdata *uctxt,
  83                          const struct hfi1_user_info *uinfo);
  84static int init_user_ctxt(struct hfi1_filedata *fd,
  85                          struct hfi1_ctxtdata *uctxt);
  86static void user_init(struct hfi1_ctxtdata *uctxt);
  87static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len);
  88static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len);
  89static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg,
  90                              u32 len);
  91static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg,
  92                              u32 len);
  93static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg,
  94                                u32 len);
  95static int setup_base_ctxt(struct hfi1_filedata *fd,
  96                           struct hfi1_ctxtdata *uctxt);
  97static int setup_subctxt(struct hfi1_ctxtdata *uctxt);
  98
  99static int find_sub_ctxt(struct hfi1_filedata *fd,
 100                         const struct hfi1_user_info *uinfo);
 101static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
 102                         struct hfi1_user_info *uinfo,
 103                         struct hfi1_ctxtdata **cd);
 104static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt);
 105static __poll_t poll_urgent(struct file *fp, struct poll_table_struct *pt);
 106static __poll_t poll_next(struct file *fp, struct poll_table_struct *pt);
 107static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
 108                          unsigned long arg);
 109static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg);
 110static int ctxt_reset(struct hfi1_ctxtdata *uctxt);
 111static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
 112                       unsigned long arg);
 113static int vma_fault(struct vm_fault *vmf);
 114static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 115                            unsigned long arg);
 116
 117static const struct file_operations hfi1_file_ops = {
 118        .owner = THIS_MODULE,
 119        .write_iter = hfi1_write_iter,
 120        .open = hfi1_file_open,
 121        .release = hfi1_file_close,
 122        .unlocked_ioctl = hfi1_file_ioctl,
 123        .poll = hfi1_poll,
 124        .mmap = hfi1_file_mmap,
 125        .llseek = noop_llseek,
 126};
 127
 128static const struct vm_operations_struct vm_ops = {
 129        .fault = vma_fault,
 130};
 131
 132/*
 133 * Types of memories mapped into user processes' space
 134 */
 135enum mmap_types {
 136        PIO_BUFS = 1,
 137        PIO_BUFS_SOP,
 138        PIO_CRED,
 139        RCV_HDRQ,
 140        RCV_EGRBUF,
 141        UREGS,
 142        EVENTS,
 143        STATUS,
 144        RTAIL,
 145        SUBCTXT_UREGS,
 146        SUBCTXT_RCV_HDRQ,
 147        SUBCTXT_EGRBUF,
 148        SDMA_COMP
 149};
 150
 151/*
 152 * Masks and offsets defining the mmap tokens
 153 */
 154#define HFI1_MMAP_OFFSET_MASK   0xfffULL
 155#define HFI1_MMAP_OFFSET_SHIFT  0
 156#define HFI1_MMAP_SUBCTXT_MASK  0xfULL
 157#define HFI1_MMAP_SUBCTXT_SHIFT 12
 158#define HFI1_MMAP_CTXT_MASK     0xffULL
 159#define HFI1_MMAP_CTXT_SHIFT    16
 160#define HFI1_MMAP_TYPE_MASK     0xfULL
 161#define HFI1_MMAP_TYPE_SHIFT    24
 162#define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
 163#define HFI1_MMAP_MAGIC_SHIFT   32
 164
 165#define HFI1_MMAP_MAGIC         0xdabbad00
 166
 167#define HFI1_MMAP_TOKEN_SET(field, val) \
 168        (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
 169#define HFI1_MMAP_TOKEN_GET(field, token) \
 170        (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
 171#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
 172        (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
 173        HFI1_MMAP_TOKEN_SET(TYPE, type) | \
 174        HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
 175        HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
 176        HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
 177
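/*
 * For reference, HFI1_MMAP_TOKEN() packs its fields into a 64-bit value
 * as follows (derived from the masks and shifts above):
 *
 *   63              32 31  28 27  24 23      16 15  12 11        0
 *  +------------------+------+------+----------+------+-----------+
 *  |      MAGIC       |unused| TYPE |   CTXT   | SUB  |  OFFSET   |
 *  +------------------+------+------+----------+------+-----------+
 *
 * User space hands a token back as the mmap() offset, and
 * hfi1_file_mmap() decodes it with HFI1_MMAP_TOKEN_GET().
 */
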
 178#define dbg(fmt, ...)                           \
 179        pr_info(fmt, ##__VA_ARGS__)
 180
 181static inline int is_valid_mmap(u64 token)
 182{
 183        return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
 184}
 185
 186static int hfi1_file_open(struct inode *inode, struct file *fp)
 187{
 188        struct hfi1_filedata *fd;
 189        struct hfi1_devdata *dd = container_of(inode->i_cdev,
 190                                               struct hfi1_devdata,
 191                                               user_cdev);
 192
 193        if (!((dd->flags & HFI1_PRESENT) && dd->kregbase1))
 194                return -EINVAL;
 195
 196        if (!atomic_inc_not_zero(&dd->user_refcount))
 197                return -ENXIO;
 198
 199        /* The real work is performed later in assign_ctxt() */
 200
 201        fd = kzalloc(sizeof(*fd), GFP_KERNEL);
 202
 203        if (fd) {
 204                fd->rec_cpu_num = -1; /* no cpu affinity by default */
 205                fd->mm = current->mm;
 206                mmgrab(fd->mm);
 207                fd->dd = dd;
 208                kobject_get(&fd->dd->kobj);
 209                fp->private_data = fd;
 210        } else {
 211                fp->private_data = NULL;
 212
 213                if (atomic_dec_and_test(&dd->user_refcount))
 214                        complete(&dd->user_comp);
 215
 216                return -ENOMEM;
 217        }
 218
 219        return 0;
 220}
 221
 222static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 223                            unsigned long arg)
 224{
 225        struct hfi1_filedata *fd = fp->private_data;
 226        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 227        int ret = 0;
 228        int uval = 0;
 229
 230        hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
 231        if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
 232            cmd != HFI1_IOCTL_GET_VERS &&
 233            !uctxt)
 234                return -EINVAL;
 235
 236        switch (cmd) {
 237        case HFI1_IOCTL_ASSIGN_CTXT:
 238                ret = assign_ctxt(fd, arg, _IOC_SIZE(cmd));
 239                break;
 240
 241        case HFI1_IOCTL_CTXT_INFO:
 242                ret = get_ctxt_info(fd, arg, _IOC_SIZE(cmd));
 243                break;
 244
 245        case HFI1_IOCTL_USER_INFO:
 246                ret = get_base_info(fd, arg, _IOC_SIZE(cmd));
 247                break;
 248
 249        case HFI1_IOCTL_CREDIT_UPD:
 250                if (uctxt)
 251                        sc_return_credits(uctxt->sc);
 252                break;
 253
 254        case HFI1_IOCTL_TID_UPDATE:
 255                ret = user_exp_rcv_setup(fd, arg, _IOC_SIZE(cmd));
 256                break;
 257
 258        case HFI1_IOCTL_TID_FREE:
 259                ret = user_exp_rcv_clear(fd, arg, _IOC_SIZE(cmd));
 260                break;
 261
 262        case HFI1_IOCTL_TID_INVAL_READ:
 263                ret = user_exp_rcv_invalid(fd, arg, _IOC_SIZE(cmd));
 264                break;
 265
 266        case HFI1_IOCTL_RECV_CTRL:
 267                ret = manage_rcvq(uctxt, fd->subctxt, arg);
 268                break;
 269
 270        case HFI1_IOCTL_POLL_TYPE:
 271                if (get_user(uval, (int __user *)arg))
 272                        return -EFAULT;
 273                uctxt->poll_type = (typeof(uctxt->poll_type))uval;
 274                break;
 275
 276        case HFI1_IOCTL_ACK_EVENT:
 277                ret = user_event_ack(uctxt, fd->subctxt, arg);
 278                break;
 279
 280        case HFI1_IOCTL_SET_PKEY:
 281                ret = set_ctxt_pkey(uctxt, arg);
 282                break;
 283
 284        case HFI1_IOCTL_CTXT_RESET:
 285                ret = ctxt_reset(uctxt);
 286                break;
 287
 288        case HFI1_IOCTL_GET_VERS:
 289                uval = HFI1_USER_SWVERSION;
 290                if (put_user(uval, (int __user *)arg))
 291                        return -EFAULT;
 292                break;
 293
 294        default:
 295                return -EINVAL;
 296        }
 297
 298        return ret;
 299}
 300
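/*
 * Illustrative sketch (not part of this file): a user-space client
 * might drive the dispatcher above roughly as follows, assuming the
 * uapi definitions (hfi1_user.h) and a device node such as /dev/hfi1_0:
 *
 *	int fd = open("/dev/hfi1_0", O_RDWR);
 *	int vers;
 *	struct hfi1_user_info uinfo = { .userversion = HFI1_USER_SWVERSION };
 *	struct hfi1_ctxt_info cinfo;
 *
 *	ioctl(fd, HFI1_IOCTL_GET_VERS, &vers);     // allowed before a context exists
 *	ioctl(fd, HFI1_IOCTL_ASSIGN_CTXT, &uinfo); // assign_ctxt()
 *	ioctl(fd, HFI1_IOCTL_CTXT_INFO, &cinfo);   // get_ctxt_info()
 *
 * Every other command is rejected with -EINVAL until a context has been
 * assigned (the !uctxt check at the top of hfi1_file_ioctl()).
 */
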
 301static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 302{
 303        struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
 304        struct hfi1_user_sdma_pkt_q *pq = fd->pq;
 305        struct hfi1_user_sdma_comp_q *cq = fd->cq;
 306        int done = 0, reqs = 0;
 307        unsigned long dim = from->nr_segs;
 308
 309        if (!cq || !pq)
 310                return -EIO;
 311
 312        if (!iter_is_iovec(from) || !dim)
 313                return -EINVAL;
 314
 315        trace_hfi1_sdma_request(fd->dd, fd->uctxt->ctxt, fd->subctxt, dim);
 316
 317        if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
 318                return -ENOSPC;
 319
 320        while (dim) {
 321                int ret;
 322                unsigned long count = 0;
 323
 324                ret = hfi1_user_sdma_process_request(
 325                        fd, (struct iovec *)(from->iov + done),
 326                        dim, &count);
 327                if (ret) {
 328                        reqs = ret;
 329                        break;
 330                }
 331                dim -= count;
 332                done += count;
 333                reqs++;
 334        }
 335
 336        return reqs;
 337}
 338
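/*
 * Illustrative sketch (not part of this file): SDMA requests reach the
 * driver through write_iter(), i.e. a writev() on the device fd.  Each
 * request consumes one or more iovecs and the return value is the
 * number of requests queued, e.g.:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = req_hdr, .iov_len = req_hdr_len }, // SDMA request header
 *		{ .iov_base = payload, .iov_len = payload_len },
 *	};
 *	ssize_t queued = writev(fd, iov, 2);
 *
 * The request header layout itself is owned by the user SDMA engine
 * (user_sdma.c), not by this file; req_hdr/payload above are only
 * placeholders.
 */
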
 339static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 340{
 341        struct hfi1_filedata *fd = fp->private_data;
 342        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 343        struct hfi1_devdata *dd;
 344        unsigned long flags;
 345        u64 token = vma->vm_pgoff << PAGE_SHIFT,
 346                memaddr = 0;
 347        void *memvirt = NULL;
 348        u8 subctxt, mapio = 0, vmf = 0, type;
 349        ssize_t memlen = 0;
 350        int ret = 0;
 351        u16 ctxt;
 352
 353        if (!is_valid_mmap(token) || !uctxt ||
 354            !(vma->vm_flags & VM_SHARED)) {
 355                ret = -EINVAL;
 356                goto done;
 357        }
 358        dd = uctxt->dd;
 359        ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
 360        subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
 361        type = HFI1_MMAP_TOKEN_GET(TYPE, token);
 362        if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
 363                ret = -EINVAL;
 364                goto done;
 365        }
 366
 367        flags = vma->vm_flags;
 368
 369        switch (type) {
 370        case PIO_BUFS:
 371        case PIO_BUFS_SOP:
 372                memaddr = ((dd->physaddr + TXE_PIO_SEND) +
 373                                /* chip pio base */
 374                           (uctxt->sc->hw_context * BIT(16))) +
 375                                /* 64K PIO space / ctxt */
 376                        (type == PIO_BUFS_SOP ?
 377                                (TXE_PIO_SIZE / 2) : 0); /* sop? */
 378                /*
 379                 * Map only the amount allocated to the context, not the
 380                 * entire available context's PIO space.
 381                 */
 382                memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
 383                flags &= ~VM_MAYREAD;
 384                flags |= VM_DONTCOPY | VM_DONTEXPAND;
 385                vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 386                mapio = 1;
 387                break;
 388        case PIO_CRED:
 389                if (flags & VM_WRITE) {
 390                        ret = -EPERM;
 391                        goto done;
 392                }
 393                /*
 394                 * The credit return location for this context could be on the
 395                 * second or third page allocated for credit returns (if number
 396                 * of enabled contexts > 64 and 128 respectively).
 397                 */
 398                memvirt = dd->cr_base[uctxt->numa_id].va;
 399                memaddr = virt_to_phys(memvirt) +
 400                        (((u64)uctxt->sc->hw_free -
 401                          (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
 402                memlen = PAGE_SIZE;
 403                flags &= ~VM_MAYWRITE;
 404                flags |= VM_DONTCOPY | VM_DONTEXPAND;
 405                /*
 406                 * The driver has already allocated memory for credit
 407                 * returns and programmed it into the chip. Has that
 408                 * memory been flagged as non-cached?
 409                 */
 410                /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
 411                mapio = 1;
 412                break;
 413        case RCV_HDRQ:
 414                memlen = uctxt->rcvhdrq_size;
 415                memvirt = uctxt->rcvhdrq;
 416                break;
 417        case RCV_EGRBUF: {
 418                unsigned long addr;
 419                int i;
 420                /*
  421                 * The RcvEgr buffer needs to be handled differently
 422                 * as multiple non-contiguous pages need to be mapped
 423                 * into the user process.
 424                 */
 425                memlen = uctxt->egrbufs.size;
 426                if ((vma->vm_end - vma->vm_start) != memlen) {
 427                        dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
 428                                   (vma->vm_end - vma->vm_start), memlen);
 429                        ret = -EINVAL;
 430                        goto done;
 431                }
 432                if (vma->vm_flags & VM_WRITE) {
 433                        ret = -EPERM;
 434                        goto done;
 435                }
 436                vma->vm_flags &= ~VM_MAYWRITE;
 437                addr = vma->vm_start;
 438                for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
 439                        memlen = uctxt->egrbufs.buffers[i].len;
 440                        memvirt = uctxt->egrbufs.buffers[i].addr;
 441                        ret = remap_pfn_range(
 442                                vma, addr,
 443                                /*
 444                                 * virt_to_pfn() does the same, but
 445                                 * it's not available on x86_64
 446                                 * when CONFIG_MMU is enabled.
 447                                 */
 448                                PFN_DOWN(__pa(memvirt)),
 449                                memlen,
 450                                vma->vm_page_prot);
 451                        if (ret < 0)
 452                                goto done;
 453                        addr += memlen;
 454                }
 455                ret = 0;
 456                goto done;
 457        }
 458        case UREGS:
 459                /*
 460                 * Map only the page that contains this context's user
 461                 * registers.
 462                 */
 463                memaddr = (unsigned long)
 464                        (dd->physaddr + RXE_PER_CONTEXT_USER)
 465                        + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
 466                /*
 467                 * TidFlow table is on the same page as the rest of the
 468                 * user registers.
 469                 */
 470                memlen = PAGE_SIZE;
 471                flags |= VM_DONTCOPY | VM_DONTEXPAND;
 472                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 473                mapio = 1;
 474                break;
 475        case EVENTS:
 476                /*
 477                 * Use the page where this context's flags are. User level
  478                 * knows where its own bitmap is within the page.
 479                 */
 480                memaddr = (unsigned long)
 481                        (dd->events + uctxt_offset(uctxt)) & PAGE_MASK;
 482                memlen = PAGE_SIZE;
 483                /*
 484                 * v3.7 removes VM_RESERVED but the effect is kept by
 485                 * using VM_IO.
 486                 */
 487                flags |= VM_IO | VM_DONTEXPAND;
 488                vmf = 1;
 489                break;
 490        case STATUS:
 491                if (flags & (unsigned long)(VM_WRITE | VM_EXEC)) {
 492                        ret = -EPERM;
 493                        goto done;
 494                }
 495                memaddr = kvirt_to_phys((void *)dd->status);
 496                memlen = PAGE_SIZE;
 497                flags |= VM_IO | VM_DONTEXPAND;
 498                break;
 499        case RTAIL:
 500                if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
 501                        /*
 502                         * If the memory allocation failed, the context alloc
 503                         * also would have failed, so we would never get here
 504                         */
 505                        ret = -EINVAL;
 506                        goto done;
 507                }
 508                if (flags & VM_WRITE) {
 509                        ret = -EPERM;
 510                        goto done;
 511                }
 512                memlen = PAGE_SIZE;
 513                memvirt = (void *)uctxt->rcvhdrtail_kvaddr;
 514                flags &= ~VM_MAYWRITE;
 515                break;
 516        case SUBCTXT_UREGS:
 517                memaddr = (u64)uctxt->subctxt_uregbase;
 518                memlen = PAGE_SIZE;
 519                flags |= VM_IO | VM_DONTEXPAND;
 520                vmf = 1;
 521                break;
 522        case SUBCTXT_RCV_HDRQ:
 523                memaddr = (u64)uctxt->subctxt_rcvhdr_base;
 524                memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
 525                flags |= VM_IO | VM_DONTEXPAND;
 526                vmf = 1;
 527                break;
 528        case SUBCTXT_EGRBUF:
 529                memaddr = (u64)uctxt->subctxt_rcvegrbuf;
 530                memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
 531                flags |= VM_IO | VM_DONTEXPAND;
 532                flags &= ~VM_MAYWRITE;
 533                vmf = 1;
 534                break;
 535        case SDMA_COMP: {
 536                struct hfi1_user_sdma_comp_q *cq = fd->cq;
 537
 538                if (!cq) {
 539                        ret = -EFAULT;
 540                        goto done;
 541                }
 542                memaddr = (u64)cq->comps;
 543                memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
 544                flags |= VM_IO | VM_DONTEXPAND;
 545                vmf = 1;
 546                break;
 547        }
 548        default:
 549                ret = -EINVAL;
 550                break;
 551        }
 552
 553        if ((vma->vm_end - vma->vm_start) != memlen) {
 554                hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
 555                          uctxt->ctxt, fd->subctxt,
 556                          (vma->vm_end - vma->vm_start), memlen);
 557                ret = -EINVAL;
 558                goto done;
 559        }
 560
 561        vma->vm_flags = flags;
 562        hfi1_cdbg(PROC,
 563                  "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
 564                    ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
 565                    vma->vm_end - vma->vm_start, vma->vm_flags);
 566        if (vmf) {
 567                vma->vm_pgoff = PFN_DOWN(memaddr);
 568                vma->vm_ops = &vm_ops;
 569                ret = 0;
 570        } else if (mapio) {
 571                ret = io_remap_pfn_range(vma, vma->vm_start,
 572                                         PFN_DOWN(memaddr),
 573                                         memlen,
 574                                         vma->vm_page_prot);
 575        } else if (memvirt) {
 576                ret = remap_pfn_range(vma, vma->vm_start,
 577                                      PFN_DOWN(__pa(memvirt)),
 578                                      memlen,
 579                                      vma->vm_page_prot);
 580        } else {
 581                ret = remap_pfn_range(vma, vma->vm_start,
 582                                      PFN_DOWN(memaddr),
 583                                      memlen,
 584                                      vma->vm_page_prot);
 585        }
 586done:
 587        return ret;
 588}
 589
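/*
 * Illustrative sketch (not part of this file): the *_bufbase tokens
 * produced by get_base_info() below are what user space passes as the
 * mmap() offset; hfi1_file_mmap() above recovers the token from
 * vm_pgoff << PAGE_SHIFT.  For example, mapping this context's user
 * registers might look like:
 *
 *	void *uregs = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, fd, binfo.user_regbase);
 *
 * where binfo is the struct hfi1_base_info returned by
 * HFI1_IOCTL_USER_INFO.
 */
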
 590/*
  591 * Local (non-chip) user memory is not mapped right away; it is mapped
  592 * on demand as it is accessed by the user-level code.
 593 */
 594static int vma_fault(struct vm_fault *vmf)
 595{
 596        struct page *page;
 597
 598        page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
 599        if (!page)
 600                return VM_FAULT_SIGBUS;
 601
 602        get_page(page);
 603        vmf->page = page;
 604
 605        return 0;
 606}
 607
 608static __poll_t hfi1_poll(struct file *fp, struct poll_table_struct *pt)
 609{
 610        struct hfi1_ctxtdata *uctxt;
 611        __poll_t pollflag;
 612
 613        uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
 614        if (!uctxt)
 615                pollflag = EPOLLERR;
 616        else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
 617                pollflag = poll_urgent(fp, pt);
 618        else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
 619                pollflag = poll_next(fp, pt);
 620        else /* invalid */
 621                pollflag = EPOLLERR;
 622
 623        return pollflag;
 624}
 625
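/*
 * Illustrative sketch (not part of this file): the poll behaviour is
 * chosen per context with HFI1_IOCTL_POLL_TYPE before polling, e.g.:
 *
 *	int type = HFI1_POLL_TYPE_ANYRCV;
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	ioctl(fd, HFI1_IOCTL_POLL_TYPE, &type);
 *	poll(&pfd, 1, timeout_ms);  // wakes when the receive queue is non-empty
 *
 * HFI1_POLL_TYPE_URGENT instead wakes on urgent packets (poll_urgent()).
 */
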
 626static int hfi1_file_close(struct inode *inode, struct file *fp)
 627{
 628        struct hfi1_filedata *fdata = fp->private_data;
 629        struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 630        struct hfi1_devdata *dd = container_of(inode->i_cdev,
 631                                               struct hfi1_devdata,
 632                                               user_cdev);
 633        unsigned long flags, *ev;
 634
 635        fp->private_data = NULL;
 636
 637        if (!uctxt)
 638                goto done;
 639
 640        hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
 641
 642        flush_wc();
 643        /* drain user sdma queue */
 644        hfi1_user_sdma_free_queues(fdata, uctxt);
 645
 646        /* release the cpu */
 647        hfi1_put_proc_affinity(fdata->rec_cpu_num);
 648
 649        /* clean up rcv side */
 650        hfi1_user_exp_rcv_free(fdata);
 651
 652        /*
 653         * fdata->uctxt is used in the above cleanup.  It is not ready to be
 654         * removed until here.
 655         */
 656        fdata->uctxt = NULL;
 657        hfi1_rcd_put(uctxt);
 658
 659        /*
 660         * Clear any left over, unhandled events so the next process that
 661         * gets this context doesn't get confused.
 662         */
 663        ev = dd->events + uctxt_offset(uctxt) + fdata->subctxt;
 664        *ev = 0;
 665
 666        spin_lock_irqsave(&dd->uctxt_lock, flags);
 667        __clear_bit(fdata->subctxt, uctxt->in_use_ctxts);
 668        if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
 669                spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 670                goto done;
 671        }
 672        spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 673
 674        /*
 675         * Disable receive context and interrupt available, reset all
 676         * RcvCtxtCtrl bits to default values.
 677         */
 678        hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
 679                     HFI1_RCVCTRL_TIDFLOW_DIS |
 680                     HFI1_RCVCTRL_INTRAVAIL_DIS |
 681                     HFI1_RCVCTRL_TAILUPD_DIS |
 682                     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
 683                     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
 684                     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
 685        /* Clear the context's J_KEY */
 686        hfi1_clear_ctxt_jkey(dd, uctxt);
 687        /*
 688         * If a send context is allocated, reset context integrity
 689         * checks to default and disable the send context.
 690         */
 691        if (uctxt->sc) {
 692                set_pio_integrity(uctxt->sc);
 693                sc_disable(uctxt->sc);
 694        }
 695
 696        hfi1_free_ctxt_rcv_groups(uctxt);
 697        hfi1_clear_ctxt_pkey(dd, uctxt);
 698
 699        uctxt->event_flags = 0;
 700
 701        deallocate_ctxt(uctxt);
 702done:
 703        mmdrop(fdata->mm);
 704        kobject_put(&dd->kobj);
 705
 706        if (atomic_dec_and_test(&dd->user_refcount))
 707                complete(&dd->user_comp);
 708
 709        kfree(fdata);
 710        return 0;
 711}
 712
 713/*
 714 * Convert kernel *virtual* addresses to physical addresses.
  715 * This is used for vmalloc'ed addresses.
 716 */
 717static u64 kvirt_to_phys(void *addr)
 718{
 719        struct page *page;
 720        u64 paddr = 0;
 721
 722        page = vmalloc_to_page(addr);
 723        if (page)
 724                paddr = page_to_pfn(page) << PAGE_SHIFT;
 725
 726        return paddr;
 727}
 728
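/*
 * Note that kvirt_to_phys() returns the physical base of the page
 * backing the address; the offset within the page is dropped by
 * page_to_pfn() << PAGE_SHIFT.  That is sufficient for the STATUS
 * mmap case above, which maps the whole containing page and lets user
 * space locate the status area via the OFFSET field of its token.
 */
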
 729/**
 730 * complete_subctxt
 731 * @fd: valid filedata pointer
 732 *
 733 * Sub-context info can only be set up after the base context
 734 * has been completed.  This is indicated by the clearing of the
  735 * HFI1_CTXT_BASE_UNINIT bit.
 736 *
 737 * Wait for the bit to be cleared, and then complete the subcontext
 738 * initialization.
 739 *
 740 */
 741static int complete_subctxt(struct hfi1_filedata *fd)
 742{
 743        int ret;
 744        unsigned long flags;
 745
 746        /*
 747         * sub-context info can only be set up after the base context
 748         * has been completed.
 749         */
 750        ret = wait_event_interruptible(
 751                fd->uctxt->wait,
 752                !test_bit(HFI1_CTXT_BASE_UNINIT, &fd->uctxt->event_flags));
 753
 754        if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags))
 755                ret = -ENOMEM;
 756
 757        /* Finish the sub-context init */
 758        if (!ret) {
 759                fd->rec_cpu_num = hfi1_get_proc_affinity(fd->uctxt->numa_id);
 760                ret = init_user_ctxt(fd, fd->uctxt);
 761        }
 762
 763        if (ret) {
 764                spin_lock_irqsave(&fd->dd->uctxt_lock, flags);
 765                __clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
 766                spin_unlock_irqrestore(&fd->dd->uctxt_lock, flags);
 767                hfi1_rcd_put(fd->uctxt);
 768                fd->uctxt = NULL;
 769        }
 770
 771        return ret;
 772}
 773
 774static int assign_ctxt(struct hfi1_filedata *fd, unsigned long arg, u32 len)
 775{
 776        int ret;
 777        unsigned int swmajor;
 778        struct hfi1_ctxtdata *uctxt = NULL;
 779        struct hfi1_user_info uinfo;
 780
 781        if (fd->uctxt)
 782                return -EINVAL;
 783
 784        if (sizeof(uinfo) != len)
 785                return -EINVAL;
 786
 787        if (copy_from_user(&uinfo, (void __user *)arg, sizeof(uinfo)))
 788                return -EFAULT;
 789
 790        swmajor = uinfo.userversion >> 16;
 791        if (swmajor != HFI1_USER_SWMAJOR)
 792                return -ENODEV;
 793
 794        if (uinfo.subctxt_cnt > HFI1_MAX_SHARED_CTXTS)
 795                return -EINVAL;
 796
 797        /*
 798         * Acquire the mutex to protect against multiple creations of what
 799         * could be a shared base context.
 800         */
 801        mutex_lock(&hfi1_mutex);
 802        /*
  803         * Get a sub-context if available (fd->uctxt will be set).
 804         * ret < 0 error, 0 no context, 1 sub-context found
 805         */
 806        ret = find_sub_ctxt(fd, &uinfo);
 807
 808        /*
 809         * Allocate a base context if context sharing is not required or a
 810         * sub context wasn't found.
 811         */
 812        if (!ret)
 813                ret = allocate_ctxt(fd, fd->dd, &uinfo, &uctxt);
 814
 815        mutex_unlock(&hfi1_mutex);
 816
 817        /* Depending on the context type, finish the appropriate init */
 818        switch (ret) {
 819        case 0:
 820                ret = setup_base_ctxt(fd, uctxt);
 821                if (ret)
 822                        deallocate_ctxt(uctxt);
 823                break;
 824        case 1:
 825                ret = complete_subctxt(fd);
 826                break;
 827        default:
 828                break;
 829        }
 830
 831        return ret;
 832}
 833
 834/**
 835 * match_ctxt
 836 * @fd: valid filedata pointer
 837 * @uinfo: user info to compare base context with
 838 * @uctxt: context to compare uinfo to.
 839 *
 840 * Compare the given context with the given information to see if it
 841 * can be used for a sub context.
 842 */
 843static int match_ctxt(struct hfi1_filedata *fd,
 844                      const struct hfi1_user_info *uinfo,
 845                      struct hfi1_ctxtdata *uctxt)
 846{
 847        struct hfi1_devdata *dd = fd->dd;
 848        unsigned long flags;
 849        u16 subctxt;
 850
 851        /* Skip dynamically allocated kernel contexts */
 852        if (uctxt->sc && (uctxt->sc->type == SC_KERNEL))
 853                return 0;
 854
 855        /* Skip ctxt if it doesn't match the requested one */
 856        if (memcmp(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)) ||
 857            uctxt->jkey != generate_jkey(current_uid()) ||
 858            uctxt->subctxt_id != uinfo->subctxt_id ||
 859            uctxt->subctxt_cnt != uinfo->subctxt_cnt)
 860                return 0;
 861
 862        /* Verify the sharing process matches the base */
 863        if (uctxt->userversion != uinfo->userversion)
 864                return -EINVAL;
 865
 866        /* Find an unused sub context */
 867        spin_lock_irqsave(&dd->uctxt_lock, flags);
 868        if (bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
 869                /* context is being closed, do not use */
 870                spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 871                return 0;
 872        }
 873
 874        subctxt = find_first_zero_bit(uctxt->in_use_ctxts,
 875                                      HFI1_MAX_SHARED_CTXTS);
 876        if (subctxt >= uctxt->subctxt_cnt) {
 877                spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 878                return -EBUSY;
 879        }
 880
 881        fd->subctxt = subctxt;
 882        __set_bit(fd->subctxt, uctxt->in_use_ctxts);
 883        spin_unlock_irqrestore(&dd->uctxt_lock, flags);
 884
 885        fd->uctxt = uctxt;
 886        hfi1_rcd_get(uctxt);
 887
 888        return 1;
 889}
 890
 891/**
 892 * find_sub_ctxt
 893 * @fd: valid filedata pointer
 894 * @uinfo: matching info to use to find a possible context to share.
 895 *
 896 * The hfi1_mutex must be held when this function is called.  It is
 897 * necessary to ensure serialized creation of shared contexts.
 898 *
 899 * Return:
 900 *    0      No sub-context found
 901 *    1      Subcontext found and allocated
 902 *    errno  EINVAL (incorrect parameters)
 903 *           EBUSY (all sub contexts in use)
 904 */
 905static int find_sub_ctxt(struct hfi1_filedata *fd,
 906                         const struct hfi1_user_info *uinfo)
 907{
 908        struct hfi1_ctxtdata *uctxt;
 909        struct hfi1_devdata *dd = fd->dd;
 910        u16 i;
 911        int ret;
 912
 913        if (!uinfo->subctxt_cnt)
 914                return 0;
 915
 916        for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) {
 917                uctxt = hfi1_rcd_get_by_index(dd, i);
 918                if (uctxt) {
 919                        ret = match_ctxt(fd, uinfo, uctxt);
 920                        hfi1_rcd_put(uctxt);
  921                        /* a non-zero value (found or error) ends the search */
 922                        if (ret)
 923                                return ret;
 924                }
 925        }
 926
 927        return 0;
 928}
 929
 930static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
 931                         struct hfi1_user_info *uinfo,
 932                         struct hfi1_ctxtdata **rcd)
 933{
 934        struct hfi1_ctxtdata *uctxt;
 935        int ret, numa;
 936
 937        if (dd->flags & HFI1_FROZEN) {
 938                /*
 939                 * Pick an error that is unique from all other errors
 940                 * that are returned so the user process knows that
  941                 * it tried to allocate while the SPC was frozen.  It
  942                 * should be able to retry with success in a short
 943                 * while.
 944                 */
 945                return -EIO;
 946        }
 947
 948        if (!dd->freectxts)
 949                return -EBUSY;
 950
 951        /*
 952         * If we don't have a NUMA node requested, preference is towards
  953         * the device's NUMA node.
 954         */
 955        fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
 956        if (fd->rec_cpu_num != -1)
 957                numa = cpu_to_node(fd->rec_cpu_num);
 958        else
 959                numa = numa_node_id();
 960        ret = hfi1_create_ctxtdata(dd->pport, numa, &uctxt);
 961        if (ret < 0) {
 962                dd_dev_err(dd, "user ctxtdata allocation failed\n");
 963                return ret;
 964        }
 965        hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
 966                  uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
 967                  uctxt->numa_id);
 968
 969        /*
 970         * Allocate and enable a PIO send context.
 971         */
 972        uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, dd->node);
 973        if (!uctxt->sc) {
 974                ret = -ENOMEM;
 975                goto ctxdata_free;
 976        }
 977        hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
 978                  uctxt->sc->hw_context);
 979        ret = sc_enable(uctxt->sc);
 980        if (ret)
 981                goto ctxdata_free;
 982
 983        /*
 984         * Setup sub context information if the user-level has requested
 985         * sub contexts.
 986         * This has to be done here so the rest of the sub-contexts find the
 987         * proper base context.
 988         */
 989        if (uinfo->subctxt_cnt)
 990                init_subctxts(uctxt, uinfo);
 991        uctxt->userversion = uinfo->userversion;
 992        uctxt->flags = hfi1_cap_mask; /* save current flag state */
 993        init_waitqueue_head(&uctxt->wait);
 994        strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
 995        memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
 996        uctxt->jkey = generate_jkey(current_uid());
 997        hfi1_stats.sps_ctxts++;
 998        /*
 999         * Disable ASPM when there are open user/PSM contexts to avoid
1000         * issues with ASPM L1 exit latency
1001         */
1002        if (dd->freectxts-- == dd->num_user_contexts)
1003                aspm_disable_all(dd);
1004
1005        *rcd = uctxt;
1006
1007        return 0;
1008
1009ctxdata_free:
1010        hfi1_free_ctxt(uctxt);
1011        return ret;
1012}
1013
1014static void deallocate_ctxt(struct hfi1_ctxtdata *uctxt)
1015{
1016        mutex_lock(&hfi1_mutex);
1017        hfi1_stats.sps_ctxts--;
1018        if (++uctxt->dd->freectxts == uctxt->dd->num_user_contexts)
1019                aspm_enable_all(uctxt->dd);
1020        mutex_unlock(&hfi1_mutex);
1021
1022        hfi1_free_ctxt(uctxt);
1023}
1024
1025static void init_subctxts(struct hfi1_ctxtdata *uctxt,
1026                          const struct hfi1_user_info *uinfo)
1027{
1028        uctxt->subctxt_cnt = uinfo->subctxt_cnt;
1029        uctxt->subctxt_id = uinfo->subctxt_id;
1030        set_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags);
1031}
1032
1033static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
1034{
1035        int ret = 0;
1036        u16 num_subctxts = uctxt->subctxt_cnt;
1037
1038        uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
1039        if (!uctxt->subctxt_uregbase)
1040                return -ENOMEM;
1041
1042        /* We can take the size of the RcvHdr Queue from the master */
1043        uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
1044                                                  num_subctxts);
1045        if (!uctxt->subctxt_rcvhdr_base) {
1046                ret = -ENOMEM;
1047                goto bail_ureg;
1048        }
1049
1050        uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
1051                                                num_subctxts);
1052        if (!uctxt->subctxt_rcvegrbuf) {
1053                ret = -ENOMEM;
1054                goto bail_rhdr;
1055        }
1056
1057        return 0;
1058
1059bail_rhdr:
1060        vfree(uctxt->subctxt_rcvhdr_base);
1061        uctxt->subctxt_rcvhdr_base = NULL;
1062bail_ureg:
1063        vfree(uctxt->subctxt_uregbase);
1064        uctxt->subctxt_uregbase = NULL;
1065
1066        return ret;
1067}
1068
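/*
 * The three vmalloc_user() regions allocated in setup_subctxt() above
 * back the shared sub-context mappings: subctxt_uregbase is exported
 * through the SUBCTXT_UREGS mmap type, subctxt_rcvhdr_base through
 * SUBCTXT_RCV_HDRQ, and subctxt_rcvegrbuf through SUBCTXT_EGRBUF, all
 * faulted in on demand by vma_fault() since they are vmalloc memory.
 */
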
1069static void user_init(struct hfi1_ctxtdata *uctxt)
1070{
1071        unsigned int rcvctrl_ops = 0;
1072
1073        /* initialize poll variables... */
1074        uctxt->urgent = 0;
1075        uctxt->urgent_poll = 0;
1076
1077        /*
1078         * Now enable the ctxt for receive.
 1079         * Some chips are set to DMA the tail register to memory
 1080         * when it changes (and when the update bit transitions from
 1081         * 0 to 1).  For those chips, we turn it off and then back on.
1082         * This will (very briefly) affect any other open ctxts, but the
1083         * duration is very short, and therefore isn't an issue.  We
1084         * explicitly set the in-memory tail copy to 0 beforehand, so we
1085         * don't have to wait to be sure the DMA update has happened
1086         * (chip resets head/tail to 0 on transition to enable).
1087         */
1088        if (uctxt->rcvhdrtail_kvaddr)
1089                clear_rcvhdrtail(uctxt);
1090
1091        /* Setup J_KEY before enabling the context */
1092        hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey);
1093
1094        rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
1095        if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
1096                rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
1097        /*
1098         * Ignore the bit in the flags for now until proper
 1099         * support for multiple packets per rcv array entry is
1100         * added.
1101         */
1102        if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
1103                rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
1104        if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
1105                rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
1106        if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
1107                rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
1108        /*
1109         * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
1110         * We can't rely on the correct value to be set from prior
1111         * uses of the chip or ctxt. Therefore, add the rcvctrl op
1112         * for both cases.
1113         */
1114        if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
1115                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
1116        else
1117                rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
1118        hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
1119}
1120
1121static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
1122{
1123        struct hfi1_ctxt_info cinfo;
1124        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1125
1126        if (sizeof(cinfo) != len)
1127                return -EINVAL;
1128
1129        memset(&cinfo, 0, sizeof(cinfo));
1130        cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
1131                                HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
1132                        HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
1133                        HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
1134        /* adjust flag if this fd is not able to cache */
1135        if (!fd->handler)
1136                cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
1137
1138        cinfo.num_active = hfi1_count_active_units();
1139        cinfo.unit = uctxt->dd->unit;
1140        cinfo.ctxt = uctxt->ctxt;
1141        cinfo.subctxt = fd->subctxt;
1142        cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
1143                                uctxt->dd->rcv_entries.group_size) +
1144                uctxt->expected_count;
1145        cinfo.credits = uctxt->sc->credits;
1146        cinfo.numa_node = uctxt->numa_id;
1147        cinfo.rec_cpu = fd->rec_cpu_num;
1148        cinfo.send_ctxt = uctxt->sc->hw_context;
1149
1150        cinfo.egrtids = uctxt->egrbufs.alloced;
1151        cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
1152        cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
1153        cinfo.sdma_ring_size = fd->cq->nentries;
1154        cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
1155
1156        trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo);
1157        if (copy_to_user((void __user *)arg, &cinfo, len))
1158                return -EFAULT;
1159
1160        return 0;
1161}
1162
1163static int init_user_ctxt(struct hfi1_filedata *fd,
1164                          struct hfi1_ctxtdata *uctxt)
1165{
1166        int ret;
1167
1168        ret = hfi1_user_sdma_alloc_queues(uctxt, fd);
1169        if (ret)
1170                return ret;
1171
1172        ret = hfi1_user_exp_rcv_init(fd, uctxt);
1173        if (ret)
1174                hfi1_user_sdma_free_queues(fd, uctxt);
1175
1176        return ret;
1177}
1178
1179static int setup_base_ctxt(struct hfi1_filedata *fd,
1180                           struct hfi1_ctxtdata *uctxt)
1181{
1182        struct hfi1_devdata *dd = uctxt->dd;
1183        int ret = 0;
1184
1185        hfi1_init_ctxt(uctxt->sc);
1186
1187        /* Now allocate the RcvHdr queue and eager buffers. */
1188        ret = hfi1_create_rcvhdrq(dd, uctxt);
1189        if (ret)
1190                goto done;
1191
1192        ret = hfi1_setup_eagerbufs(uctxt);
1193        if (ret)
1194                goto done;
1195
1196        /* If sub-contexts are enabled, do the appropriate setup */
1197        if (uctxt->subctxt_cnt)
1198                ret = setup_subctxt(uctxt);
1199        if (ret)
1200                goto done;
1201
1202        ret = hfi1_alloc_ctxt_rcv_groups(uctxt);
1203        if (ret)
1204                goto done;
1205
1206        ret = init_user_ctxt(fd, uctxt);
1207        if (ret)
1208                goto done;
1209
1210        user_init(uctxt);
1211
1212        /* Now that the context is set up, the fd can get a reference. */
1213        fd->uctxt = uctxt;
1214        hfi1_rcd_get(uctxt);
1215
1216done:
1217        if (uctxt->subctxt_cnt) {
1218                /*
1219                 * On error, set the failed bit so sub-contexts will clean up
1220                 * correctly.
1221                 */
1222                if (ret)
1223                        set_bit(HFI1_CTXT_BASE_FAILED, &uctxt->event_flags);
1224
1225                /*
1226                 * Base context is done (successfully or not), notify anybody
1227                 * using a sub-context that is waiting for this completion.
1228                 */
1229                clear_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags);
1230                wake_up(&uctxt->wait);
1231        }
1232
1233        return ret;
1234}
1235
1236static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
1237{
1238        struct hfi1_base_info binfo;
1239        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1240        struct hfi1_devdata *dd = uctxt->dd;
1241        unsigned offset;
1242
1243        trace_hfi1_uctxtdata(uctxt->dd, uctxt, fd->subctxt);
1244
1245        if (sizeof(binfo) != len)
1246                return -EINVAL;
1247
1248        memset(&binfo, 0, sizeof(binfo));
1249        binfo.hw_version = dd->revision;
1250        binfo.sw_version = HFI1_KERN_SWVERSION;
1251        binfo.bthqp = kdeth_qp;
1252        binfo.jkey = uctxt->jkey;
1253        /*
1254         * If more than 64 contexts are enabled the allocated credit
1255         * return will span two or three contiguous pages. Since we only
1256         * map the page containing the context's credit return address,
1257         * we need to calculate the offset in the proper page.
1258         */
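        /*
         * Worked example with illustrative numbers and 4 KiB pages: if
         * hw_free sits 0x2040 bytes past cr_base[].va, the PIO_CRED mmap
         * case maps the third credit-return page (0x2040 & PAGE_MASK ==
         * 0x2000) and the offset reported here is 0x2040 % PAGE_SIZE ==
         * 0x40.
         */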
1259        offset = ((u64)uctxt->sc->hw_free -
1260                  (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
1261        binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
1262                                                fd->subctxt, offset);
1263        binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
1264                                            fd->subctxt,
1265                                            uctxt->sc->base_addr);
1266        binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
1267                                                uctxt->ctxt,
1268                                                fd->subctxt,
1269                                                uctxt->sc->base_addr);
1270        binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
1271                                               fd->subctxt,
1272                                               uctxt->rcvhdrq);
1273        binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
1274                                               fd->subctxt,
1275                                               uctxt->egrbufs.rcvtids[0].dma);
1276        binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
1277                                                  fd->subctxt, 0);
1278        /*
1279         * user regs are at
1280         * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
1281         */
1282        binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
1283                                             fd->subctxt, 0);
1284        offset = offset_in_page((uctxt_offset(uctxt) + fd->subctxt) *
1285                                sizeof(*dd->events));
1286        binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
1287                                               fd->subctxt,
1288                                               offset);
1289        binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
1290                                               fd->subctxt,
1291                                               dd->status);
1292        if (HFI1_CAP_IS_USET(DMA_RTAIL))
1293                binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
1294                                                        fd->subctxt, 0);
1295        if (uctxt->subctxt_cnt) {
1296                binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
1297                                                         uctxt->ctxt,
1298                                                         fd->subctxt, 0);
1299                binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
1300                                                          uctxt->ctxt,
1301                                                          fd->subctxt, 0);
1302                binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
1303                                                          uctxt->ctxt,
1304                                                          fd->subctxt, 0);
1305        }
1306
1307        if (copy_to_user((void __user *)arg, &binfo, len))
1308                return -EFAULT;
1309
1310        return 0;
1311}
1312
1313/**
1314 * user_exp_rcv_setup - Set up the given tid rcv list
1315 * @fd: file data of the current driver instance
 1316 * @arg: ioctl argument for user space information
1317 * @len: length of data structure associated with ioctl command
1318 *
1319 * Wrapper to validate ioctl information before doing _rcv_setup.
1320 *
1321 */
1322static int user_exp_rcv_setup(struct hfi1_filedata *fd, unsigned long arg,
1323                              u32 len)
1324{
1325        int ret;
1326        unsigned long addr;
1327        struct hfi1_tid_info tinfo;
1328
1329        if (sizeof(tinfo) != len)
1330                return -EINVAL;
1331
1332        if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo))))
1333                return -EFAULT;
1334
1335        ret = hfi1_user_exp_rcv_setup(fd, &tinfo);
1336        if (!ret) {
1337                /*
1338                 * Copy the number of tidlist entries we used
1339                 * and the length of the buffer we registered.
1340                 */
1341                addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
1342                if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
1343                                 sizeof(tinfo.tidcnt)))
1344                        return -EFAULT;
1345
1346                addr = arg + offsetof(struct hfi1_tid_info, length);
1347                if (copy_to_user((void __user *)addr, &tinfo.length,
1348                                 sizeof(tinfo.length)))
1349                        ret = -EFAULT;
1350        }
1351
1352        return ret;
1353}
1354
1355/**
1356 * user_exp_rcv_clear - Clear the given tid rcv list
1357 * @fd: file data of the current driver instance
 1358 * @arg: ioctl argument for user space information
1359 * @len: length of data structure associated with ioctl command
1360 *
1361 * The hfi1_user_exp_rcv_clear() can be called from the error path.  Because
1362 * of this, we need to use this wrapper to copy the user space information
1363 * before doing the clear.
1364 */
1365static int user_exp_rcv_clear(struct hfi1_filedata *fd, unsigned long arg,
1366                              u32 len)
1367{
1368        int ret;
1369        unsigned long addr;
1370        struct hfi1_tid_info tinfo;
1371
1372        if (sizeof(tinfo) != len)
1373                return -EINVAL;
1374
1375        if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo))))
1376                return -EFAULT;
1377
1378        ret = hfi1_user_exp_rcv_clear(fd, &tinfo);
1379        if (!ret) {
1380                addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
1381                if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
1382                                 sizeof(tinfo.tidcnt)))
1383                        return -EFAULT;
1384        }
1385
1386        return ret;
1387}
1388
1389/**
1390 * user_exp_rcv_invalid - Invalidate the given tid rcv list
1391 * @fd: file data of the current driver instance
 1392 * @arg: ioctl argument for user space information
1393 * @len: length of data structure associated with ioctl command
1394 *
1395 * Wrapper to validate ioctl information before doing _rcv_invalid.
1396 *
1397 */
1398static int user_exp_rcv_invalid(struct hfi1_filedata *fd, unsigned long arg,
1399                                u32 len)
1400{
1401        int ret;
1402        unsigned long addr;
1403        struct hfi1_tid_info tinfo;
1404
1405        if (sizeof(tinfo) != len)
1406                return -EINVAL;
1407
1408        if (!fd->invalid_tids)
1409                return -EINVAL;
1410
1411        if (copy_from_user(&tinfo, (void __user *)arg, (sizeof(tinfo))))
1412                return -EFAULT;
1413
1414        ret = hfi1_user_exp_rcv_invalid(fd, &tinfo);
1415        if (ret)
1416                return ret;
1417
1418        addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
1419        if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
1420                         sizeof(tinfo.tidcnt)))
1421                ret = -EFAULT;
1422
1423        return ret;
1424}
1425
1426static __poll_t poll_urgent(struct file *fp,
1427                                struct poll_table_struct *pt)
1428{
1429        struct hfi1_filedata *fd = fp->private_data;
1430        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1431        struct hfi1_devdata *dd = uctxt->dd;
1432        __poll_t pollflag;
1433
1434        poll_wait(fp, &uctxt->wait, pt);
1435
1436        spin_lock_irq(&dd->uctxt_lock);
1437        if (uctxt->urgent != uctxt->urgent_poll) {
1438                pollflag = EPOLLIN | EPOLLRDNORM;
1439                uctxt->urgent_poll = uctxt->urgent;
1440        } else {
1441                pollflag = 0;
1442                set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
1443        }
1444        spin_unlock_irq(&dd->uctxt_lock);
1445
1446        return pollflag;
1447}
1448
1449static __poll_t poll_next(struct file *fp,
1450                              struct poll_table_struct *pt)
1451{
1452        struct hfi1_filedata *fd = fp->private_data;
1453        struct hfi1_ctxtdata *uctxt = fd->uctxt;
1454        struct hfi1_devdata *dd = uctxt->dd;
1455        __poll_t pollflag;
1456
1457        poll_wait(fp, &uctxt->wait, pt);
1458
1459        spin_lock_irq(&dd->uctxt_lock);
1460        if (hdrqempty(uctxt)) {
1461                set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
1462                hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt);
1463                pollflag = 0;
1464        } else {
1465                pollflag = EPOLLIN | EPOLLRDNORM;
1466        }
1467        spin_unlock_irq(&dd->uctxt_lock);
1468
1469        return pollflag;
1470}
1471
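/*
 * Illustrative sketch (not part of the driver): both poll handlers above
 * follow the usual pattern of registering on uctxt->wait with poll_wait()
 * and then testing the wake-up condition under uctxt_lock, arming an
 * event flag (and, for poll_next, the receive-available interrupt) when
 * the condition is not yet true.  From user space this is an ordinary
 * poll(2) on the context file descriptor; the choice between the "urgent"
 * and "next packet" semantics is made beforehand, via what I believe is
 * the HFI1_IOCTL_POLL_TYPE command (name assumed from the hfi1 uapi).
 *
 *	struct pollfd pfd = { .fd = ctxt_fd, .events = POLLIN };
 *
 *	// Blocks until the driver reports EPOLLIN | EPOLLRDNORM, i.e. the
 *	// header queue is non-empty (poll_next) or a new urgent packet
 *	// has arrived (poll_urgent).
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		process_rcv_queue();	// hypothetical consumer
 */
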
1472/*
1473 * Find all user contexts in use, and set the specified bit in their
1474 * event mask.
1475 * See also find_ctxt() for a similar use that is specific to send buffers.
1476 */
1477int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
1478{
1479        struct hfi1_ctxtdata *uctxt;
1480        struct hfi1_devdata *dd = ppd->dd;
1481        u16 ctxt;
1482
1483        if (!dd->events)
1484                return -EINVAL;
1485
1486        for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts;
1487             ctxt++) {
1488                uctxt = hfi1_rcd_get_by_index(dd, ctxt);
1489                if (uctxt) {
1490                        unsigned long *evs;
1491                        int i;
1492                        /*
1493                         * subctxt_cnt is 0 if the context is not shared,
1494                         * so handle the base context first, then any
1495                         * remaining subcontexts.
1496                        evs = dd->events + uctxt_offset(uctxt);
1497                        set_bit(evtbit, evs);
1498                        for (i = 1; i < uctxt->subctxt_cnt; i++)
1499                                set_bit(evtbit, evs + i);
1500                        hfi1_rcd_put(uctxt);
1501                }
1502        }
1503
1504        return 0;
1505}
1506
1507/**
1508 * manage_rcvq - manage a context's receive queue
1509 * @uctxt: the context
1510 * @subctxt: the sub-context
1511 * @arg: user space pointer to an int carrying the start/stop value
1512 *
1513 * start_stop == 0 disables receive on the context, for use in queue
1514 * overflow conditions.  start_stop == 1 re-enables receive and
1515 * re-initializes the software copy of the head register.
1516 */
1517static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
1518                       unsigned long arg)
1519{
1520        struct hfi1_devdata *dd = uctxt->dd;
1521        unsigned int rcvctrl_op;
1522        int start_stop;
1523
1524        if (subctxt)
1525                return 0;
1526
1527        if (get_user(start_stop, (int __user *)arg))
1528                return -EFAULT;
1529
1530        /* atomically set or clear receive enable for the ctxt */
1531        if (start_stop) {
1532                /*
1533                 * On enable, force in-memory copy of the tail register to
1534                 * 0, so that protocol code doesn't have to worry about
1535                 * whether the chip has yet updated the in-memory copy
1536                 * on return from the system call. The chip always
1537                 * resets its tail register back to 0 on a
1538                 * transition from disabled to enabled.
1539                 */
1540                if (uctxt->rcvhdrtail_kvaddr)
1541                        clear_rcvhdrtail(uctxt);
1542                rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
1543        } else {
1544                rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
1545        }
1546        hfi1_rcvctrl(dd, rcvctrl_op, uctxt);
1547        /* always; new head should be equal to new tail; see above */
1548
1549        return 0;
1550}
1551
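/*
 * Illustrative sketch (not part of the driver): manage_rcvq() is the
 * overflow-recovery knob.  A receiver that has fallen behind can disable
 * the context, drain its software queue, and then re-enable it, at which
 * point the in-memory tail copy is forced back to 0 as described above
 * (head == tail).  The ioctl command name is an assumption
 * (HFI1_IOCTL_RECV_CTRL in the hfi1 uapi, if memory serves); only the
 * user pointer to an int start/stop value is evident from this code.
 *
 *	int stop = 0, start = 1;
 *
 *	ioctl(ctxt_fd, HFI1_IOCTL_RECV_CTRL, &stop);	// stop receive
 *	drain_local_queue();				// hypothetical catch-up
 *	ioctl(ctxt_fd, HFI1_IOCTL_RECV_CTRL, &start);	// re-enable
 */
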
1552/*
1553 * Clear the event notifier events for this context.
1554 * The user process then performs whatever actions are appropriate to the
1555 * bits having been set, if desired, and checks again in the future.
1556 */
1557static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
1558                          unsigned long arg)
1559{
1560        int i;
1561        struct hfi1_devdata *dd = uctxt->dd;
1562        unsigned long *evs;
1563        unsigned long events;
1564
1565        if (!dd->events)
1566                return 0;
1567
1568        if (get_user(events, (unsigned long __user *)arg))
1569                return -EFAULT;
1570
1571        evs = dd->events + uctxt_offset(uctxt) + subctxt;
1572
1573        for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
1574                if (!test_bit(i, &events))
1575                        continue;
1576                clear_bit(i, evs);
1577        }
1578        return 0;
1579}
1580
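/*
 * Illustrative sketch (not part of the driver): the events bitmap written
 * by hfi1_set_uevent_bits() is exported read-only to user space (via the
 * events mmap token, if I recall the hfi1 uapi correctly), and
 * user_event_ack() is the matching acknowledge path.  The ioctl name and
 * the HFI1_EVENT_* constant below are assumptions based on
 * include/uapi/rdma/hfi/hfi1_user.h.
 *
 *	unsigned long pending = *mapped_events;	// mmap'd per-(sub)ctxt word
 *
 *	if (pending & HFI1_EVENT_FROZEN)
 *		recover_from_freeze();		// hypothetical handler
 *
 *	// Tell the driver which bits were seen so it can clear them.
 *	ioctl(ctxt_fd, HFI1_IOCTL_ACK_EVENT, &pending);
 */
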
1581static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg)
1582{
1583        int i;
1584        struct hfi1_pportdata *ppd = uctxt->ppd;
1585        struct hfi1_devdata *dd = uctxt->dd;
1586        u16 pkey;
1587
1588        if (!HFI1_CAP_IS_USET(PKEY_CHECK))
1589                return -EPERM;
1590
1591        if (get_user(pkey, (u16 __user *)arg))
1592                return -EFAULT;
1593
1594        if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY)
1595                return -EINVAL;
1596
1597        for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
1598                if (pkey == ppd->pkeys[i])
1599                        return hfi1_set_ctxt_pkey(dd, uctxt, pkey);
1600
1601        return -ENOENT;
1602}
1603
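/*
 * Illustrative sketch (not part of the driver): set_ctxt_pkey() only
 * accepts a pkey that is already present in the port's pkey table and is
 * not one of the management pkeys, so -EINVAL and -ENOENT distinguish
 * "never allowed" from "not currently in the table".  The ioctl name is
 * an assumption (HFI1_IOCTL_SET_PKEY in the hfi1 uapi); the u16 pointer
 * argument is evident from the code above.
 *
 *	uint16_t pkey = 0x8001;		// example pkey from the port table
 *
 *	if (ioctl(ctxt_fd, HFI1_IOCTL_SET_PKEY, &pkey) < 0) {
 *		if (errno == ENOENT)
 *			fprintf(stderr, "pkey not in port table\n");
 *		else if (errno == EPERM)
 *			fprintf(stderr, "PKEY_CHECK capability not set\n");
 *	}
 */
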
1604/**
1605 * ctxt_reset - Reset the user context
1606 * @uctxt: valid user context
1607 */
1608static int ctxt_reset(struct hfi1_ctxtdata *uctxt)
1609{
1610        struct send_context *sc;
1611        struct hfi1_devdata *dd;
1612        int ret = 0;
1613
1614        if (!uctxt || !uctxt->dd || !uctxt->sc)
1615                return -EINVAL;
1616
1617        /*
1618         * There is no protection here. User level has to guarantee that
1619         * no one will be writing to the send context while it is being
1620         * re-initialized.  If user level breaks that guarantee, it will
1621         * break its own context and no one else's.
1622         */
1623        dd = uctxt->dd;
1624        sc = uctxt->sc;
1625
1626        /*
1627         * Wait until the interrupt handler has marked the context as
1628         * halted or frozen. Report error if we time out.
1629         */
1630        wait_event_interruptible_timeout(
1631                sc->halt_wait, (sc->flags & SCF_HALTED),
1632                msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
1633        if (!(sc->flags & SCF_HALTED))
1634                return -ENOLCK;
1635
1636        /*
1637         * If the send context was halted due to a Freeze, wait until the
1638         * device has been "unfrozen" before resetting the context.
1639         */
1640        if (sc->flags & SCF_FROZEN) {
1641                wait_event_interruptible_timeout(
1642                        dd->event_queue,
1643                        !(READ_ONCE(dd->flags) & HFI1_FROZEN),
1644                        msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
1645                if (dd->flags & HFI1_FROZEN)
1646                        return -ENOLCK;
1647
1648                if (dd->flags & HFI1_FORCED_FREEZE)
1649                        /*
1650                         * Don't allow a context reset while we are in
1651                         * a forced freeze.
1652                         */
1653                        return -ENODEV;
1654
1655                sc_disable(sc);
1656                ret = sc_enable(sc);
1657                hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, uctxt);
1658        } else {
1659                ret = sc_restart(sc);
1660        }
1661        if (!ret)
1662                sc_return_credits(sc);
1663
1664        return ret;
1665}
1666
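/*
 * Illustrative sketch (not part of the driver): ctxt_reset() is only
 * useful once the hardware has halted (or frozen) the send context, so
 * the expected user-space sequence is "send error observed -> reset ->
 * retry".  The ioctl name is an assumption (HFI1_IOCTL_CTXT_RESET in the
 * hfi1 uapi); the command takes no argument, matching the code above.
 *
 *	if (pio_write_failed_with_halt) {
 *		if (ioctl(ctxt_fd, HFI1_IOCTL_CTXT_RESET) == 0)
 *			retry_pio_writes();	// credits were returned
 *		else if (errno == ENOLCK)
 *			give_up();	// context never halted/unfroze in time
 *	}
 */
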
1667static void user_remove(struct hfi1_devdata *dd)
1668{
1669
1670        hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
1671}
1672
1673static int user_add(struct hfi1_devdata *dd)
1674{
1675        char name[10];
1676        int ret;
1677
1678        snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
1679        ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
1680                             &dd->user_cdev, &dd->user_device,
1681                             true, &dd->kobj);
1682        if (ret)
1683                user_remove(dd);
1684
1685        return ret;
1686}
1687
1688/*
1689 * Create per-unit files in /dev
1690 */
1691int hfi1_device_create(struct hfi1_devdata *dd)
1692{
1693        return user_add(dd);
1694}
1695
1696/*
1697 * Remove per-unit files in /dev
1698 * Returns void; the core kernel reports no errors for this teardown.
1699 */
1700void hfi1_device_remove(struct hfi1_devdata *dd)
1701{
1702        user_remove(dd);
1703}
1704
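/*
 * Illustrative sketch (not part of the driver): user_add() registers a
 * character device named "<class>_<unit>", so assuming class_name()
 * returns "hfi1" the per-unit node appears as /dev/hfi1_<unit>.  Opening
 * it is the entry point for everything above.
 *
 *	int ctxt_fd = open("/dev/hfi1_0", O_RDWR);	// unit 0, name assumed
 *
 *	if (ctxt_fd < 0)
 *		err(1, "open hfi1 unit 0");
 *	// ...HFI1_IOCTL_ASSIGN_CTXT etc. follow (see assign_ctxt() earlier).
 */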