linux/drivers/infiniband/hw/hfi1/user_sdma.c
// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
 * Copyright(c) 2020 - Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
                            struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
                               unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
                                 struct hfi1_pkt_header *hdr, u32 lrhlen,
                                 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
                            struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
                                struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
                                  struct hfi1_user_sdma_comp_q *cq,
                                  u16 idx, enum hfi1_sdma_comp_state state,
                                  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
        struct sdma_engine *sde,
        struct iowait_work *wait,
        struct sdma_txreq *txreq,
        uint seq,
        bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
                           unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
                         void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
        .filter = sdma_rb_filter,
        .insert = sdma_rb_insert,
        .evict = sdma_rb_evict,
        .remove = sdma_rb_remove,
        .invalidate = sdma_rb_invalidate
};

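/*
 * iowait 'sleep' callback, invoked by the SDMA engine when a descriptor
 * cannot be accepted (ring full): queue the packet queue on the engine's
 * dmawait list and mark it deferred, unless the engine made progress in
 * the meantime.
 */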
static int defer_packet_queue(
        struct sdma_engine *sde,
        struct iowait_work *wait,
        struct sdma_txreq *txreq,
        uint seq,
        bool pkts_sent)
{
        struct hfi1_user_sdma_pkt_q *pq =
                container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

        write_seqlock(&sde->waitlock);
        trace_hfi1_usdma_defer(pq, sde, &pq->busy);
        if (sdma_progress(sde, seq, txreq))
                goto eagain;
        /*
         * We are assuming that if the list is enqueued somewhere, it
         * is to the dmawait list since that is the only place where
         * it is supposed to be enqueued.
         */
        xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
        if (list_empty(&pq->busy.list)) {
                pq->busy.lock = &sde->waitlock;
                iowait_get_priority(&pq->busy);
                iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
        }
        write_sequnlock(&sde->waitlock);
        return -EBUSY;
eagain:
        write_sequnlock(&sde->waitlock);
        return -EAGAIN;
}

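/*
 * iowait 'wakeup' callback: mark the packet queue active again and wake
 * any thread blocked in hfi1_user_sdma_process_request() waiting for
 * descriptors to free up.
 */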
static void activate_packet_queue(struct iowait *wait, int reason)
{
        struct hfi1_user_sdma_pkt_q *pq =
                container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

        trace_hfi1_usdma_activate(pq, wait, reason);
        xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
        wake_up(&wait->wait_dma);
}

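/*
 * Allocate the per-context packet queue (pq) and completion queue (cq)
 * used for user SDMA and register the pq with the MMU rb-tree handler.
 */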
int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
                                struct hfi1_filedata *fd)
{
        int ret = -ENOMEM;
        char buf[64];
        struct hfi1_devdata *dd;
        struct hfi1_user_sdma_comp_q *cq;
        struct hfi1_user_sdma_pkt_q *pq;

        if (!uctxt || !fd)
                return -EBADF;

        if (!hfi1_sdma_comp_ring_size)
                return -EINVAL;

        dd = uctxt->dd;

        pq = kzalloc(sizeof(*pq), GFP_KERNEL);
        if (!pq)
                return -ENOMEM;
        pq->dd = dd;
        pq->ctxt = uctxt->ctxt;
        pq->subctxt = fd->subctxt;
        pq->n_max_reqs = hfi1_sdma_comp_ring_size;
        atomic_set(&pq->n_reqs, 0);
        init_waitqueue_head(&pq->wait);
        atomic_set(&pq->n_locked, 0);

        iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
                    activate_packet_queue, NULL, NULL);
        pq->reqidx = 0;

        pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
                           sizeof(*pq->reqs),
                           GFP_KERNEL);
        if (!pq->reqs)
                goto pq_reqs_nomem;

        pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
                                 sizeof(*pq->req_in_use),
                                 GFP_KERNEL);
        if (!pq->req_in_use)
                goto pq_reqs_no_in_use;

        snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
                 fd->subctxt);
        pq->txreq_cache = kmem_cache_create(buf,
                                            sizeof(struct user_sdma_txreq),
                                            L1_CACHE_BYTES,
                                            SLAB_HWCACHE_ALIGN,
                                            NULL);
        if (!pq->txreq_cache) {
                dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
                           uctxt->ctxt);
                goto pq_txreq_nomem;
        }

        cq = kzalloc(sizeof(*cq), GFP_KERNEL);
        if (!cq)
                goto cq_nomem;

        cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
                                 * hfi1_sdma_comp_ring_size));
        if (!cq->comps)
                goto cq_comps_nomem;

        cq->nentries = hfi1_sdma_comp_ring_size;

        ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
                                   &pq->handler);
        if (ret) {
                dd_dev_err(dd, "Failed to register with MMU %d", ret);
                goto pq_mmu_fail;
        }

        rcu_assign_pointer(fd->pq, pq);
        fd->cq = cq;

        return 0;

pq_mmu_fail:
        vfree(cq->comps);
cq_comps_nomem:
        kfree(cq);
cq_nomem:
        kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
        kfree(pq->req_in_use);
pq_reqs_no_in_use:
        kfree(pq->reqs);
pq_reqs_nomem:
        kfree(pq);

        return ret;
}

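/* Remove the packet queue from an SDMA engine's dmawait list, if queued. */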
static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
        unsigned long flags;
        seqlock_t *lock = pq->busy.lock;

        if (!lock)
                return;
        write_seqlock_irqsave(lock, flags);
        if (!list_empty(&pq->busy.list)) {
                list_del_init(&pq->busy.list);
                pq->busy.lock = NULL;
        }
        write_sequnlock_irqrestore(lock, flags);
}

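/*
 * Tear down the user SDMA queues for a context: block new requests, drain
 * outstanding ones, then free the pq and cq resources.
 */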
int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
                               struct hfi1_ctxtdata *uctxt)
{
        struct hfi1_user_sdma_pkt_q *pq;

        trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

        spin_lock(&fd->pq_rcu_lock);
        pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
                                    lockdep_is_held(&fd->pq_rcu_lock));
        if (pq) {
                rcu_assign_pointer(fd->pq, NULL);
                spin_unlock(&fd->pq_rcu_lock);
                synchronize_srcu(&fd->pq_srcu);
                /* at this point there can be no more new requests */
                if (pq->handler)
                        hfi1_mmu_rb_unregister(pq->handler);
                iowait_sdma_drain(&pq->busy);
                /* Wait until all requests have been freed. */
                wait_event_interruptible(
                        pq->wait,
                        !atomic_read(&pq->n_reqs));
                kfree(pq->reqs);
                kfree(pq->req_in_use);
                kmem_cache_destroy(pq->txreq_cache);
                flush_pq_iowait(pq);
                kfree(pq);
        } else {
                spin_unlock(&fd->pq_rcu_lock);
        }
        if (fd->cq) {
                vfree(fd->cq->comps);
                kfree(fd->cq);
                fd->cq = NULL;
        }
        return 0;
}

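/*
 * Map a DLID to a small selector value used for SDMA engine selection so
 * that traffic to the same destination tends to land on the same engine.
 */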
static u8 dlid_to_selector(u16 dlid)
{
        static u8 mapping[256];
        static int initialized;
        static u8 next;
        int hash;

        if (!initialized) {
                memset(mapping, 0xFF, 256);
                initialized = 1;
        }

        hash = ((dlid >> 8) ^ dlid) & 0xFF;
        if (mapping[hash] == 0xFF) {
                mapping[hash] = next;
                next = (next + 1) & 0x7F;
        }

        return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
                                   struct iovec *iovec, unsigned long dim,
                                   unsigned long *count)
{
        int ret = 0, i;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_user_sdma_pkt_q *pq =
                srcu_dereference(fd->pq, &fd->pq_srcu);
        struct hfi1_user_sdma_comp_q *cq = fd->cq;
        struct hfi1_devdata *dd = pq->dd;
        unsigned long idx = 0;
        u8 pcount = initial_pkt_count;
        struct sdma_req_info info;
        struct user_sdma_request *req;
        u8 opcode, sc, vl;
        u16 pkey;
        u32 slid;
        u16 dlid;
        u32 selector;

        if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
                hfi1_cdbg(
                   SDMA,
                   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
                   dd->unit, uctxt->ctxt, fd->subctxt,
                   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
                return -EINVAL;
        }
        ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
        if (ret) {
                hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
                          dd->unit, uctxt->ctxt, fd->subctxt, ret);
                return -EFAULT;
        }

        trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
                                     (u16 *)&info);
        if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Invalid comp index",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
                return -EINVAL;
        }

        /*
         * Sanity check the header io vector count.  Need at least 1 vector
         * (header) and cannot be larger than the actual io vector count.
         */
        if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
                          req_iovcnt(info.ctrl), dim);
                return -EINVAL;
        }

        if (!info.fragsize) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Request does not specify fragsize",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
                return -EINVAL;
        }

        /* Try to claim the request. */
        if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
                hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
                          dd->unit, uctxt->ctxt, fd->subctxt,
                          info.comp_idx);
                return -EBADSLT;
        }
        /*
         * All safety checks have been done and this request has been claimed.
         */
        trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
                                             info.comp_idx);
        req = pq->reqs + info.comp_idx;
        req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
        req->data_len  = 0;
        req->pq = pq;
        req->cq = cq;
        req->ahg_idx = -1;
        req->iov_idx = 0;
        req->sent = 0;
        req->seqnum = 0;
        req->seqcomp = 0;
        req->seqsubmitted = 0;
        req->tids = NULL;
        req->has_error = 0;
        INIT_LIST_HEAD(&req->txps);

        memcpy(&req->info, &info, sizeof(info));

        /* The request is initialized, count it */
        atomic_inc(&pq->n_reqs);

        if (req_opcode(info.ctrl) == EXPECTED) {
                /* an expected request needs a TID info vector and at least one data vector */
                if (req->data_iovs < 2) {
                        SDMA_DBG(req,
                                 "Not enough vectors for expected request");
                        ret = -EINVAL;
                        goto free_req;
                }
                req->data_iovs--;
        }

        if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
                SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
                         MAX_VECTORS_PER_REQ);
                ret = -EINVAL;
                goto free_req;
        }
        /* Copy the header from the user buffer */
        ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
                             sizeof(req->hdr));
        if (ret) {
                SDMA_DBG(req, "Failed to copy header template (%d)", ret);
                ret = -EFAULT;
                goto free_req;
        }

        /* If Static rate control is not enabled, sanitize the header. */
        if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
                req->hdr.pbc[2] = 0;

        /* Validate the opcode. Do not trust packets from user space blindly. */
        opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
        if ((opcode & USER_OPCODE_CHECK_MASK) !=
             USER_OPCODE_CHECK_VAL) {
                SDMA_DBG(req, "Invalid opcode (%d)", opcode);
                ret = -EINVAL;
                goto free_req;
        }
        /*
         * Validate the vl. Do not trust packets from user space blindly.
         * VL comes from PBC, SC comes from LRH, and the VL needs to
         * match the SC look up.
         */
        vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
        sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
              (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
        if (vl >= dd->pport->vls_operational ||
            vl != sc_to_vlt(dd, sc)) {
                SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
                ret = -EINVAL;
                goto free_req;
        }

        /* Checking P_KEY for requests from user-space */
        pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
        slid = be16_to_cpu(req->hdr.lrh[3]);
        if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
                ret = -EINVAL;
                goto free_req;
        }

        /*
         * Also should check the BTH.lnh. If it says the next header is GRH then
         * the RXE parsing will be off and will land in the middle of the KDETH
         * or miss it entirely.
         */
        if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
                SDMA_DBG(req, "User tried to pass in a GRH");
                ret = -EINVAL;
                goto free_req;
        }

        req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
        /*
         * Calculate the initial TID offset based on the values of
         * KDETH.OFFSET and KDETH.OM that are passed in.
         */
        req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
                (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
                 KDETH_OM_LARGE : KDETH_OM_SMALL);
        trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
                                               info.comp_idx, req->tidoffset);
        idx++;

        /* Save all the IO vector structures */
        for (i = 0; i < req->data_iovs; i++) {
                req->iovs[i].offset = 0;
                INIT_LIST_HEAD(&req->iovs[i].list);
                memcpy(&req->iovs[i].iov,
                       iovec + idx++,
                       sizeof(req->iovs[i].iov));
                ret = pin_vector_pages(req, &req->iovs[i]);
                if (ret) {
                        req->data_iovs = i;
                        goto free_req;
                }
                req->data_len += req->iovs[i].iov.iov_len;
        }
        trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
                                         info.comp_idx, req->data_len);
        if (pcount > req->info.npkts)
                pcount = req->info.npkts;
        /*
         * Copy any TID info
         * User space will provide the TID info only when the
         * request type is EXPECTED. This is true even if there is
         * only one packet in the request and the header is already
         * setup. The reason for the singular TID case is that the
         * driver needs to perform safety checks.
         */
        if (req_opcode(req->info.ctrl) == EXPECTED) {
                u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
                u32 *tmp;

                if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
                        ret = -EINVAL;
                        goto free_req;
                }

                /*
                 * We have to copy all of the tids because they may vary
                 * in size and, therefore, the TID count might not be
                 * equal to the pkt count. However, there is no way to
                 * tell at this point.
                 */
                tmp = memdup_user(iovec[idx].iov_base,
                                  ntids * sizeof(*req->tids));
                if (IS_ERR(tmp)) {
                        ret = PTR_ERR(tmp);
                        SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
                                 ntids, ret);
                        goto free_req;
                }
                req->tids = tmp;
                req->n_tids = ntids;
                req->tididx = 0;
                idx++;
        }

        dlid = be16_to_cpu(req->hdr.lrh[1]);
        selector = dlid_to_selector(dlid);
        selector += uctxt->ctxt + fd->subctxt;
        req->sde = sdma_select_user_engine(dd, selector, vl);

        if (!req->sde || !sdma_running(req->sde)) {
                ret = -ECOMM;
                goto free_req;
        }

        /* We don't need an AHG entry if the request contains only one packet */
        if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
                req->ahg_idx = sdma_ahg_alloc(req->sde);

        set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
        pq->state = SDMA_PKT_Q_ACTIVE;

        /*
         * This is a somewhat blocking send implementation.
         * The driver will block the caller until all packets of the
         * request have been submitted to the SDMA engine. However, it
         * will not wait for send completions.
         */
        while (req->seqsubmitted != req->info.npkts) {
                ret = user_sdma_send_pkts(req, pcount);
                if (ret < 0) {
                        int we_ret;

                        if (ret != -EBUSY)
                                goto free_req;
                        we_ret = wait_event_interruptible_timeout(
                                pq->busy.wait_dma,
                                pq->state == SDMA_PKT_Q_ACTIVE,
                                msecs_to_jiffies(
                                        SDMA_IOWAIT_TIMEOUT));
                        trace_hfi1_usdma_we(pq, we_ret);
                        if (we_ret <= 0)
                                flush_pq_iowait(pq);
                }
        }
        *count += idx;
        return 0;
free_req:
        /*
         * If seqsubmitted == npkts, the completion routine controls the final
         * state.  If seqsubmitted < npkts, wait for any outstanding packets to
         * finish before cleaning up.
         */
        if (req->seqsubmitted < req->info.npkts) {
                if (req->seqsubmitted)
                        wait_event(pq->busy.wait_dma,
                                   (req->seqcomp == req->seqsubmitted - 1));
                user_sdma_free_request(req, true);
                pq_update(pq);
                set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
        }
        return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
                                      struct user_sdma_txreq *tx)
{
        /*
         * Determine the proper size of the packet data.
         * The size of the data of the first packet is in the header
         * template. However, it includes the header and ICRC, which need
         * to be subtracted.
         * The minimum representable packet data length in a header is 4 bytes;
         * therefore, when the request data length is less than 4 bytes, there
         * is only one packet and its data length equals the request data
         * length.
         * The size of the remaining packets is the minimum of the frag
         * size (MTU) and the remaining data in the request.
         */
        u32 len;

        if (!req->seqnum) {
                if (req->data_len < sizeof(u32))
                        len = req->data_len;
                else
                        len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
                               (sizeof(tx->hdr) - 4));
        } else if (req_opcode(req->info.ctrl) == EXPECTED) {
                u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
                        PAGE_SIZE;
                /*
                 * Get the data length based on the remaining space in the
                 * TID pair.
                 */
                len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
                /* If we've filled up the TID pair, move to the next one. */
                if (unlikely(!len) && ++req->tididx < req->n_tids &&
                    req->tids[req->tididx]) {
                        tidlen = EXP_TID_GET(req->tids[req->tididx],
                                             LEN) * PAGE_SIZE;
                        req->tidoffset = 0;
                        len = min_t(u32, tidlen, req->info.fragsize);
                }
                /*
                 * Since the TID pairs map entire pages, make sure that we
                 * are not going to try to send more data than we have
                 * remaining.
                 */
                len = min(len, req->data_len - req->sent);
        } else {
                len = min(req->data_len - req->sent, (u32)req->info.fragsize);
        }
        trace_hfi1_sdma_user_compute_length(req->pq->dd,
                                            req->pq->ctxt,
                                            req->pq->subctxt,
                                            req->info.comp_idx,
                                            len);
        return len;
}

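/* Round a length up to the next multiple of 4 bytes (dword padding). */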
static inline u32 pad_len(u32 len)
{
        if (len & (sizeof(u32) - 1))
                len += sizeof(u32) - (len & (sizeof(u32) - 1));
        return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
        /* (Size of complete header - size of PBC) + 4B ICRC + data length */
        return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

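/*
 * Build the first (AHG copy) txreq of a request: fix up the PBC/LRH
 * lengths, validate the header template, and attach the header.
 */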
static int user_sdma_txadd_ahg(struct user_sdma_request *req,
                               struct user_sdma_txreq *tx,
                               u32 datalen)
{
        int ret;
        u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
        u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
        struct hfi1_user_sdma_pkt_q *pq = req->pq;

        /*
         * Copy the request header into the tx header
         * because the HW needs a cacheline-aligned
         * address.
         * This copy can be optimized out if the hdr
         * member of user_sdma_request were also
         * cacheline aligned.
         */
        memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
        if (PBC2LRH(pbclen) != lrhlen) {
                pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
                tx->hdr.pbc[0] = cpu_to_le16(pbclen);
        }
        ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
        if (ret)
                return ret;
        ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
                              sizeof(tx->hdr) + datalen, req->ahg_idx,
                              0, NULL, 0, user_sdma_txreq_cb);
        if (ret)
                return ret;
        ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
        if (ret)
                sdma_txclean(pq->dd, &tx->txreq);
        return ret;
}

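/*
 * Add up to one page worth of payload from the current iovec to a txreq,
 * advancing to the next iovec when the current one is exhausted.
 */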
static int user_sdma_txadd(struct user_sdma_request *req,
                           struct user_sdma_txreq *tx,
                           struct user_sdma_iovec *iovec, u32 datalen,
                           u32 *queued_ptr, u32 *data_sent_ptr,
                           u64 *iov_offset_ptr)
{
        int ret;
        unsigned int pageidx, len;
        unsigned long base, offset;
        u64 iov_offset = *iov_offset_ptr;
        u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
        struct hfi1_user_sdma_pkt_q *pq = req->pq;

        base = (unsigned long)iovec->iov.iov_base;
        offset = offset_in_page(base + iovec->offset + iov_offset);
        pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
                   PAGE_SHIFT);
        len = offset + req->info.fragsize > PAGE_SIZE ?
                PAGE_SIZE - offset : req->info.fragsize;
        len = min((datalen - queued), len);
        ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
                              offset, len);
        if (ret) {
                SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
                return ret;
        }
        iov_offset += len;
        queued += len;
        data_sent += len;
        if (unlikely(queued < datalen && pageidx == iovec->npages &&
                     req->iov_idx < req->data_iovs - 1)) {
                iovec->offset += iov_offset;
                iovec = &req->iovs[++req->iov_idx];
                iov_offset = 0;
        }

        *queued_ptr = queued;
        *data_sent_ptr = data_sent;
        *iov_offset_ptr = iov_offset;
        return ret;
}

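/*
 * Build and submit up to maxpkts packets of the request to its SDMA
 * engine. Returns a negative errno on failure, e.g. -EBUSY when the ring
 * is full and the caller should wait and retry.
 */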
static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
        int ret = 0;
        u16 count;
        unsigned npkts = 0;
        struct user_sdma_txreq *tx = NULL;
        struct hfi1_user_sdma_pkt_q *pq = NULL;
        struct user_sdma_iovec *iovec = NULL;

        if (!req->pq)
                return -EINVAL;

        pq = req->pq;

        /* If tx completion has reported an error, we are done. */
        if (READ_ONCE(req->has_error))
                return -EFAULT;

        /*
         * Check if we might have sent the entire request already
         */
        if (unlikely(req->seqnum == req->info.npkts)) {
                if (!list_empty(&req->txps))
                        goto dosend;
                return ret;
        }

        if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
                maxpkts = req->info.npkts - req->seqnum;

        while (npkts < maxpkts) {
                u32 datalen = 0, queued = 0, data_sent = 0;
                u64 iov_offset = 0;

                /*
                 * Check whether any of the completions have come back
                 * with errors. If so, we are not going to process any
                 * more packets from this request.
                 */
                if (READ_ONCE(req->has_error))
                        return -EFAULT;

                tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
                if (!tx)
                        return -ENOMEM;

                tx->flags = 0;
                tx->req = req;
                INIT_LIST_HEAD(&tx->list);

                /*
                 * For the last packet set the ACK request
                 * and disable header suppression.
                 */
                if (req->seqnum == req->info.npkts - 1)
                        tx->flags |= (TXREQ_FLAGS_REQ_ACK |
                                      TXREQ_FLAGS_REQ_DISABLE_SH);

                /*
                 * Calculate the payload size - this is min of the fragment
                 * (MTU) size or the remaining bytes in the request but only
                 * if we have payload data.
                 */
                if (req->data_len) {
                        iovec = &req->iovs[req->iov_idx];
                        if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
                                if (++req->iov_idx == req->data_iovs) {
                                        ret = -EFAULT;
                                        goto free_tx;
                                }
                                iovec = &req->iovs[req->iov_idx];
                                WARN_ON(iovec->offset);
                        }

                        datalen = compute_data_length(req, tx);

                        /*
                         * Disable header suppression when the payload is
                         * <= 8 DWs. If there is an uncorrectable error in the
                         * receive data FIFO and the received payload size is
                         * less than or equal to 8 DWs, RxDmaDataFifoRdUncErr
                         * is not reported; instead, RHF.EccErr is set when the
                         * header is not suppressed.
                         */
                        if (!datalen) {
                                SDMA_DBG(req,
                                         "Request has data but pkt len is 0");
                                ret = -EFAULT;
                                goto free_tx;
                        } else if (datalen <= 32) {
                                tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
                        }
                }

                if (req->ahg_idx >= 0) {
                        if (!req->seqnum) {
                                ret = user_sdma_txadd_ahg(req, tx, datalen);
                                if (ret)
                                        goto free_tx;
                        } else {
                                int changes;

                                changes = set_txreq_header_ahg(req, tx,
                                                               datalen);
                                if (changes < 0) {
                                        ret = changes;
                                        goto free_tx;
                                }
                        }
                } else {
                        ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
                                          datalen, user_sdma_txreq_cb);
                        if (ret)
                                goto free_tx;
                        /*
                         * Modify the header for this packet. This only needs
                         * to be done if we are not going to use AHG. Otherwise,
                         * the HW will do it based on the changes we gave it
                         * during sdma_txinit_ahg().
                         */
                        ret = set_txreq_header(req, tx, datalen);
                        if (ret)
                                goto free_txreq;
                }

                /*
                 * If the request contains any data vectors, add up to
                 * fragsize bytes to the descriptor.
                 */
                while (queued < datalen &&
                       (req->sent + data_sent) < req->data_len) {
                        ret = user_sdma_txadd(req, tx, iovec, datalen,
                                              &queued, &data_sent, &iov_offset);
                        if (ret)
                                goto free_txreq;
                }
                /*
                 * The txreq was submitted successfully so we can update
                 * the counters.
                 */
                req->koffset += datalen;
                if (req_opcode(req->info.ctrl) == EXPECTED)
                        req->tidoffset += datalen;
                req->sent += data_sent;
                if (req->data_len)
                        iovec->offset += iov_offset;
                list_add_tail(&tx->txreq.list, &req->txps);
                /*
                 * It is important to increment this here as it is used to
                 * generate the BTH.PSN and, therefore, can't be bulk-updated
                 * outside of the loop.
                 */
                tx->seqnum = req->seqnum++;
                npkts++;
        }
dosend:
        ret = sdma_send_txlist(req->sde,
                               iowait_get_ib_work(&pq->busy),
                               &req->txps, &count);
        req->seqsubmitted += count;
        if (req->seqsubmitted == req->info.npkts) {
                /*
                 * The txreq has already been submitted to the HW queue
                 * so we can free the AHG entry now. Corruption will not
                 * happen due to the sequential manner in which
                 * descriptors are processed.
                 */
                if (req->ahg_idx >= 0)
                        sdma_ahg_free(req->sde, req->ahg_idx);
        }
        return ret;

free_txreq:
        sdma_txclean(pq->dd, &tx->txreq);
free_tx:
        kmem_cache_free(pq->txreq_cache, tx);
        return ret;
}

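/* Ask the MMU rb-tree cache to evict entries until ~npages pages are freed. */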
static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
        struct evict_data evict_data;

        evict_data.cleared = 0;
        evict_data.target = npages;
        hfi1_mmu_rb_evict(pq->handler, &evict_data);
        return evict_data.cleared;
}

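/*
 * Pin the additional user pages needed for this iovec, evicting cached
 * entries first if the pinned-page limit would otherwise be exceeded.
 */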
static int pin_sdma_pages(struct user_sdma_request *req,
                          struct user_sdma_iovec *iovec,
                          struct sdma_mmu_node *node,
                          int npages)
{
        int pinned, cleared;
        struct page **pages;
        struct hfi1_user_sdma_pkt_q *pq = req->pq;

        pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;
        memcpy(pages, node->pages, node->npages * sizeof(*pages));

        npages -= node->npages;
retry:
        if (!hfi1_can_pin_pages(pq->dd, current->mm,
                                atomic_read(&pq->n_locked), npages)) {
                cleared = sdma_cache_evict(pq, npages);
                if (cleared >= npages)
                        goto retry;
        }
        pinned = hfi1_acquire_user_pages(current->mm,
                                         ((unsigned long)iovec->iov.iov_base +
                                         (node->npages * PAGE_SIZE)), npages, 0,
                                         pages + node->npages);
        if (pinned < 0) {
                kfree(pages);
                return pinned;
        }
        if (pinned != npages) {
                unpin_vector_pages(current->mm, pages, node->npages, pinned);
                return -EFAULT;
        }
        kfree(node->pages);
        node->rb.len = iovec->iov.iov_len;
        node->pages = pages;
        atomic_add(pinned, &pq->n_locked);
        return pinned;
}

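/* Release and unpin all pages tracked by an SDMA MMU rb-tree node. */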
static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
        if (node->npages) {
                unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
                                   node->npages);
                atomic_sub(node->npages, &node->pq->n_locked);
        }
}

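/*
 * Look up (or create) the cached pinned-page node covering this iovec,
 * pin any missing pages, and insert the node into the MMU rb-tree cache.
 */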
static int pin_vector_pages(struct user_sdma_request *req,
                            struct user_sdma_iovec *iovec)
{
        int ret = 0, pinned, npages;
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct sdma_mmu_node *node = NULL;
        struct mmu_rb_node *rb_node;
        struct iovec *iov;
        bool extracted;

        extracted =
                hfi1_mmu_rb_remove_unless_exact(pq->handler,
                                                (unsigned long)
                                                iovec->iov.iov_base,
                                                iovec->iov.iov_len, &rb_node);
        if (rb_node) {
                node = container_of(rb_node, struct sdma_mmu_node, rb);
                if (!extracted) {
                        atomic_inc(&node->refcount);
                        iovec->pages = node->pages;
                        iovec->npages = node->npages;
                        iovec->node = node;
                        return 0;
                }
        }

        if (!node) {
                node = kzalloc(sizeof(*node), GFP_KERNEL);
                if (!node)
                        return -ENOMEM;

                node->rb.addr = (unsigned long)iovec->iov.iov_base;
                node->pq = pq;
                atomic_set(&node->refcount, 0);
        }

        iov = &iovec->iov;
        npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
        if (node->npages < npages) {
                pinned = pin_sdma_pages(req, iovec, node, npages);
                if (pinned < 0) {
                        ret = pinned;
                        goto bail;
                }
                node->npages += pinned;
                npages = node->npages;
        }
        iovec->pages = node->pages;
        iovec->npages = npages;
        iovec->node = node;

        ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
        if (ret) {
                iovec->node = NULL;
                goto bail;
        }
        return 0;
bail:
        unpin_sdma_pages(node);
        kfree(node);
        return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
                               unsigned start, unsigned npages)
{
        hfi1_release_user_pages(mm, pages + start, npages, false);
        kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
                                 struct hfi1_pkt_header *hdr, u32 lrhlen,
                                 u32 datalen)
{
        /*
         * Perform safety checks for any type of packet:
         *    - transfer size is multiple of 64bytes
         *    - packet length is multiple of 4 bytes
         *    - packet length is not larger than MTU size
         *
         * These checks are only done for the first packet of the
         * transfer since the header is "given" to us by user space.
         * For the remainder of the packets we compute the values.
         */
        if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
            lrhlen > get_lrh_len(*hdr, req->info.fragsize))
                return -EINVAL;

        if (req_opcode(req->info.ctrl) == EXPECTED) {
                /*
                 * The header is checked only on the first packet. Furthermore,
                 * we ensure that at least one TID entry is copied when the
                 * request is submitted. Therefore, we don't have to verify that
                 * tididx points to something sane.
                 */
                u32 tidval = req->tids[req->tididx],
                        tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
                        tididx = EXP_TID_GET(tidval, IDX),
                        tidctrl = EXP_TID_GET(tidval, CTRL),
                        tidoff;
                __le32 kval = hdr->kdeth.ver_tid_offset;

                tidoff = KDETH_GET(kval, OFFSET) *
                          (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
                           KDETH_OM_LARGE : KDETH_OM_SMALL);
                /*
                 * Expected receive packets have the following
                 * additional checks:
                 *     - offset is not larger than the TID size
                 *     - TIDCtrl values match between header and TID array
                 *     - TID indexes match between header and TID array
                 */
                if ((tidoff + datalen > tidlen) ||
                    KDETH_GET(kval, TIDCTRL) != tidctrl ||
                    KDETH_GET(kval, TID) != tididx)
                        return -EINVAL;
        }
        return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
        u32 val = be32_to_cpu(bthpsn),
                mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
                        0xffffffull),
                psn = val & mask;
        if (expct)
                psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
                        ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
        else
                psn = psn + frags;
        return psn & mask;
}

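/*
 * Build the complete packet header for a non-AHG txreq from the request
 * template: adjust lengths, BTH.PSN, KDETH offset and TID fields, then
 * attach the header to the descriptor.
 */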
static int set_txreq_header(struct user_sdma_request *req,
                            struct user_sdma_txreq *tx, u32 datalen)
{
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct hfi1_pkt_header *hdr = &tx->hdr;
        u8 omfactor; /* KDETH.OM */
        u16 pbclen;
        int ret;
        u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

        /* Copy the header template to the request before modification */
        memcpy(hdr, &req->hdr, sizeof(*hdr));

        /*
         * Check if the PBC and LRH length are mismatched. If so
         * adjust both in the header.
         */
        pbclen = le16_to_cpu(hdr->pbc[0]);
        if (PBC2LRH(pbclen) != lrhlen) {
                pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
                hdr->pbc[0] = cpu_to_le16(pbclen);
                hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
                /*
                 * Third packet
                 * This is the first packet in the sequence that has
                 * a "static" size that can be used for the rest of
                 * the packets (besides the last one).
                 */
                if (unlikely(req->seqnum == 2)) {
                        /*
                         * From this point on the lengths in both the
                         * PBC and LRH are the same until the last
                         * packet.
                         * Adjust the template so we don't have to update
                         * every packet
                         */
                        req->hdr.pbc[0] = hdr->pbc[0];
                        req->hdr.lrh[2] = hdr->lrh[2];
                }
        }
        /*
         * We only have to modify the header if this is not the
         * first packet in the request. Otherwise, we use the
         * header given to us.
         */
        if (unlikely(!req->seqnum)) {
                ret = check_header_template(req, hdr, lrhlen, datalen);
                if (ret)
                        return ret;
                goto done;
        }

        hdr->bth[2] = cpu_to_be32(
                set_pkt_bth_psn(hdr->bth[2],
                                (req_opcode(req->info.ctrl) == EXPECTED),
                                req->seqnum));

        /* Set ACK request on last packet */
        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
                hdr->bth[2] |= cpu_to_be32(1UL << 31);

        /* Set the new offset */
        hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
        /* Expected packets have to fill in the new TID information */
        if (req_opcode(req->info.ctrl) == EXPECTED) {
                tidval = req->tids[req->tididx];
                /*
                 * If the offset puts us at the end of the current TID,
                 * advance everything.
                 */
                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
                                         PAGE_SIZE)) {
                        req->tidoffset = 0;
                        /*
                         * Since we don't copy all the TIDs, all at once,
                         * we have to check again.
                         */
                        if (++req->tididx > req->n_tids - 1 ||
                            !req->tids[req->tididx]) {
                                return -EINVAL;
                        }
                        tidval = req->tids[req->tididx];
                }
                omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
                        KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
                        KDETH_OM_SMALL_SHIFT;
                /* Set KDETH.TIDCtrl based on value for this TID. */
                KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
                          EXP_TID_GET(tidval, CTRL));
                /* Set KDETH.TID based on value for this TID */
                KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
                          EXP_TID_GET(tidval, IDX));
                /* Clear KDETH.SH when DISABLE_SH flag is set */
                if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
                        KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
                /*
                 * Set the KDETH.OFFSET and KDETH.OM based on size of
                 * transfer.
                 */
                trace_hfi1_sdma_user_tid_info(
                        pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
                        req->tidoffset, req->tidoffset >> omfactor,
                        omfactor != KDETH_OM_SMALL_SHIFT);
                KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
                          req->tidoffset >> omfactor);
                KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
                          omfactor != KDETH_OM_SMALL_SHIFT);
        }
done:
        trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
                                    req->info.comp_idx, hdr, tidval);
        return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

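/*
 * Build the AHG (Automatic Header Generation) update list describing only
 * the header fields that change for this packet and initialize the txreq
 * with it.
 */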
static int set_txreq_header_ahg(struct user_sdma_request *req,
                                struct user_sdma_txreq *tx, u32 datalen)
{
        u32 ahg[AHG_KDETH_ARRAY_SIZE];
        int idx = 0;
        u8 omfactor; /* KDETH.OM */
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct hfi1_pkt_header *hdr = &req->hdr;
        u16 pbclen = le16_to_cpu(hdr->pbc[0]);
        u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
        size_t array_size = ARRAY_SIZE(ahg);

        if (PBC2LRH(pbclen) != lrhlen) {
                /* PBC.PbcLengthDWs */
                idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
                                     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
                if (idx < 0)
                        return idx;
                /* LRH.PktLen (we need the full 16 bits due to byte swap) */
                idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
                                     (__force u16)cpu_to_be16(lrhlen >> 2));
                if (idx < 0)
                        return idx;
        }

        /*
         * Do the common updates
         */
        /* BTH.PSN and BTH.A */
        val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
                (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
                val32 |= 1UL << 31;
        idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
                             (__force u16)cpu_to_be16(val32 >> 16));
        if (idx < 0)
                return idx;
        idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
                             (__force u16)cpu_to_be16(val32 & 0xffff));
        if (idx < 0)
                return idx;
        /* KDETH.Offset */
        idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
                             (__force u16)cpu_to_le16(req->koffset & 0xffff));
        if (idx < 0)
                return idx;
        idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
                             (__force u16)cpu_to_le16(req->koffset >> 16));
        if (idx < 0)
                return idx;
        if (req_opcode(req->info.ctrl) == EXPECTED) {
                __le16 val;

                tidval = req->tids[req->tididx];

                /*
                 * If the offset puts us at the end of the current TID,
                 * advance everything.
                 */
                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
                                         PAGE_SIZE)) {
                        req->tidoffset = 0;
                        /*
                         * Since we don't copy all the TIDs, all at once,
                         * we have to check again.
                         */
                        if (++req->tididx > req->n_tids - 1 ||
                            !req->tids[req->tididx])
                                return -EINVAL;
                        tidval = req->tids[req->tididx];
                }
                omfactor = ((EXP_TID_GET(tidval, LEN) *
                                  PAGE_SIZE) >=
                                 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
                                 KDETH_OM_SMALL_SHIFT;
                /* KDETH.OM and KDETH.OFFSET (TID) */
                idx = ahg_header_set(
                                ahg, idx, array_size, 7, 0, 16,
                                ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
                                ((req->tidoffset >> omfactor)
                                & 0x7fff)));
                if (idx < 0)
                        return idx;
                /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
                val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
                                   (EXP_TID_GET(tidval, IDX) & 0x3ff));

                if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
                        val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
                                                      INTR) <<
                                            AHG_KDETH_INTR_SHIFT));
                } else {
                        val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
                               cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
                               cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
                                                      INTR) <<
                                             AHG_KDETH_INTR_SHIFT));
                }

                idx = ahg_header_set(ahg, idx, array_size,
                                     7, 16, 14, (__force u16)val);
                if (idx < 0)
                        return idx;
        }

        trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
                                        req->info.comp_idx, req->sde->this_idx,
                                        req->ahg_idx, ahg, idx, tidval);
        sdma_txinit_ahg(&tx->txreq,
                        SDMA_TXREQ_F_USE_AHG,
                        datalen, req->ahg_idx, idx,
                        ahg, sizeof(req->hdr),
                        user_sdma_txreq_cb);

        return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
        struct user_sdma_txreq *tx =
                container_of(txreq, struct user_sdma_txreq, txreq);
        struct user_sdma_request *req;
        struct hfi1_user_sdma_pkt_q *pq;
        struct hfi1_user_sdma_comp_q *cq;
        enum hfi1_sdma_comp_state state = COMPLETE;

        if (!tx->req)
                return;

        req = tx->req;
        pq = req->pq;
        cq = req->cq;

        if (status != SDMA_TXREQ_S_OK) {
                SDMA_DBG(req, "SDMA completion with error %d",
                         status);
                WRITE_ONCE(req->has_error, 1);
                state = ERROR;
        }

        req->seqcomp = tx->seqnum;
        kmem_cache_free(pq->txreq_cache, tx);

        /* sequence isn't complete?  We are done */
        if (req->seqcomp != req->info.npkts - 1)
                return;

        user_sdma_free_request(req, false);
        set_comp_state(pq, cq, req->info.comp_idx, state, status);
        pq_update(pq);
}

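/* Drop the request count and wake anyone waiting for the queue to drain. */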
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
        if (atomic_dec_and_test(&pq->n_reqs))
                wake_up(&pq->wait);
}

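/*
 * Free a request's unsubmitted txreqs, release (or just unreference) its
 * pinned iovec nodes, free the TID array, and release the completion slot.
 */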
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
        int i;

        if (!list_empty(&req->txps)) {
                struct sdma_txreq *t, *p;

                list_for_each_entry_safe(t, p, &req->txps, list) {
                        struct user_sdma_txreq *tx =
                                container_of(t, struct user_sdma_txreq, txreq);
                        list_del_init(&t->list);
                        sdma_txclean(req->pq->dd, t);
                        kmem_cache_free(req->pq->txreq_cache, tx);
                }
        }

        for (i = 0; i < req->data_iovs; i++) {
                struct sdma_mmu_node *node = req->iovs[i].node;

                if (!node)
                        continue;

                req->iovs[i].node = NULL;

                if (unpin)
                        hfi1_mmu_rb_remove(req->pq->handler,
                                           &node->rb);
                else
                        atomic_dec(&node->refcount);
        }

        kfree(req->tids);
        clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

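/* Publish a request's final state (and error code) to the completion ring. */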
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
                                  struct hfi1_user_sdma_comp_q *cq,
                                  u16 idx, enum hfi1_sdma_comp_state state,
                                  int ret)
{
        if (state == ERROR)
                cq->comps[idx].errcode = -ret;
        smp_wmb(); /* make sure errcode is visible first */
        cq->comps[idx].status = state;
        trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
                                        idx, state, ret);
}

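/* MMU rb-tree callbacks for the SDMA pinned-page cache. */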
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
                           unsigned long len)
{
        return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);

        atomic_inc(&node->refcount);
        return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
                         void *evict_arg, bool *stop)
{
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);
        struct evict_data *evict_data = evict_arg;

        /* is this node still being used? */
        if (atomic_read(&node->refcount))
                return 0; /* keep this node */

        /* this node will be evicted, add its pages to our count */
        evict_data->cleared += node->npages;

        /* have enough pages been cleared? */
        if (evict_data->cleared >= evict_data->target)
                *stop = true;

        return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);

        unpin_sdma_pages(node);
        kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
        struct sdma_mmu_node *node =
                container_of(mnode, struct sdma_mmu_node, rb);

        if (!atomic_read(&node->refcount))
                return 1;
        return 0;
}