LXR linux/drivers/infiniband/hw/hfi1/user

   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
  26#include "sdma.h"
  27#include "mmu_rb.h"
  28#include "user_sdma.h"
  29#include "verbs.h"  /* for the headers */
  30#include "common.h" /* for struct hfi1_tid_info */
  31#include "trace.h"
  32
  33static uint hfi1_sdma_comp_ring_size = 128;
  34module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  35MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  36
  37static unsigned initial_pkt_count = 8;
  38
  39static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  40static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  41static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  42static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
  43static int pin_vector_pages(struct user_sdma_request *req,
  44                            struct user_sdma_iovec *iovec);
  45static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  46                               unsigned start, unsigned npages);
  47static int check_header_template(struct user_sdma_request *req,
  48                                 struct hfi1_pkt_header *hdr, u32 lrhlen,
  49                                 u32 datalen);
  50static int set_txreq_header(struct user_sdma_request *req,
  51                            struct user_sdma_txreq *tx, u32 datalen);
  52static int set_txreq_header_ahg(struct user_sdma_request *req,
  53                                struct user_sdma_txreq *tx, u32 len);
  54static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  55                                  struct hfi1_user_sdma_comp_q *cq,
  56                                  u16 idx, enum hfi1_sdma_comp_state state,
  57                                  int ret);
  58static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  59static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  60
  61static int defer_packet_queue(
  62        struct sdma_engine *sde,
  63        struct iowait_work *wait,
  64        struct sdma_txreq *txreq,
  65        uint seq,
  66        bool pkts_sent);
  67static void activate_packet_queue(struct iowait *wait, int reason);
  68static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
  69                           unsigned long len);
  70static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
  71static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
  72                         void *arg2, bool *stop);
  73static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
  74static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
  75
  76static struct mmu_rb_ops sdma_rb_ops = {
  77        .filter = sdma_rb_filter,
  78        .insert = sdma_rb_insert,
  79        .evict = sdma_rb_evict,
  80        .remove = sdma_rb_remove,
  81        .invalidate = sdma_rb_invalidate
  82};
  83
  84static int defer_packet_queue(
  85        struct sdma_engine *sde,
  86        struct iowait_work *wait,
  87        struct sdma_txreq *txreq,
  88        uint seq,
  89        bool pkts_sent)
  90{
  91        struct hfi1_user_sdma_pkt_q *pq =
  92                container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  93
  94        write_seqlock(&sde->waitlock);
  95        trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  96        if (sdma_progress(sde, seq, txreq))
  97                goto eagain;
  98        /*
  99         * We are assuming that if the list is enqueued somewhere, it
 100         * is to the dmawait list since that is the only place where
 101         * it is supposed to be enqueued.
 102         */
 103        xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 104        if (list_empty(&pq->busy.list)) {
 105                pq->busy.lock = &sde->waitlock;
 106                iowait_get_priority(&pq->busy);
 107                iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
 108        }
 109        write_sequnlock(&sde->waitlock);
 110        return -EBUSY;
 111eagain:
 112        write_sequnlock(&sde->waitlock);
 113        return -EAGAIN;
 114}
 115
 116static void activate_packet_queue(struct iowait *wait, int reason)
 117{
 118        struct hfi1_user_sdma_pkt_q *pq =
 119                container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 120
 121        trace_hfi1_usdma_activate(pq, wait, reason);
 122        xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 123        wake_up(&wait->wait_dma);
 124};
 125
 126int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 127                                struct hfi1_filedata *fd)
 128{
 129        int ret = -ENOMEM;
 130        char buf[64];
 131        struct hfi1_devdata *dd;
 132        struct hfi1_user_sdma_comp_q *cq;
 133        struct hfi1_user_sdma_pkt_q *pq;
 134
 135        if (!uctxt || !fd)
 136                return -EBADF;
 137
 138        if (!hfi1_sdma_comp_ring_size)
 139                return -EINVAL;
 140
 141        dd = uctxt->dd;
 142
 143        pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 144        if (!pq)
 145                return -ENOMEM;
 146        pq->dd = dd;
 147        pq->ctxt = uctxt->ctxt;
 148        pq->subctxt = fd->subctxt;
 149        pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 150        atomic_set(&pq->n_reqs, 0);
 151        init_waitqueue_head(&pq->wait);
 152        atomic_set(&pq->n_locked, 0);
 153
 154        iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 155                    activate_packet_queue, NULL, NULL);
 156        pq->reqidx = 0;
 157
 158        pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 159                           sizeof(*pq->reqs),
 160                           GFP_KERNEL);
 161        if (!pq->reqs)
 162                goto pq_reqs_nomem;
 163
 164        pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 165        if (!pq->req_in_use)
 166                goto pq_reqs_no_in_use;
 167
 168        snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 169                 fd->subctxt);
 170        pq->txreq_cache = kmem_cache_create(buf,
 171                                            sizeof(struct user_sdma_txreq),
 172                                            L1_CACHE_BYTES,
 173                                            SLAB_HWCACHE_ALIGN,
 174                                            NULL);
 175        if (!pq->txreq_cache) {
 176                dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 177                           uctxt->ctxt);
 178                goto pq_txreq_nomem;
 179        }
 180
 181        cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 182        if (!cq)
 183                goto cq_nomem;
 184
 185        cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 186                                 * hfi1_sdma_comp_ring_size));
 187        if (!cq->comps)
 188                goto cq_comps_nomem;
 189
 190        cq->nentries = hfi1_sdma_comp_ring_size;
 191
 192        ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
 193                                   &pq->handler);
 194        if (ret) {
 195                dd_dev_err(dd, "Failed to register with MMU %d", ret);
 196                goto pq_mmu_fail;
 197        }
 198
 199        rcu_assign_pointer(fd->pq, pq);
 200        fd->cq = cq;
 201
 202        return 0;
 203
 204pq_mmu_fail:
 205        vfree(cq->comps);
 206cq_comps_nomem:
 207        kfree(cq);
 208cq_nomem:
 209        kmem_cache_destroy(pq->txreq_cache);
 210pq_txreq_nomem:
 211        bitmap_free(pq->req_in_use);
 212pq_reqs_no_in_use:
 213        kfree(pq->reqs);
 214pq_reqs_nomem:
 215        kfree(pq);
 216
 217        return ret;
 218}
 219
 220static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 221{
 222        unsigned long flags;
 223        seqlock_t *lock = pq->busy.lock;
 224
 225        if (!lock)
 226                return;
 227        write_seqlock_irqsave(lock, flags);
 228        if (!list_empty(&pq->busy.list)) {
 229                list_del_init(&pq->busy.list);
 230                pq->busy.lock = NULL;
 231        }
 232        write_sequnlock_irqrestore(lock, flags);
 233}
 234
 235int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 236                               struct hfi1_ctxtdata *uctxt)
 237{
 238        struct hfi1_user_sdma_pkt_q *pq;
 239
 240        trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 241
 242        spin_lock(&fd->pq_rcu_lock);
 243        pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 244                                    lockdep_is_held(&fd->pq_rcu_lock));
 245        if (pq) {
 246                rcu_assign_pointer(fd->pq, NULL);
 247                spin_unlock(&fd->pq_rcu_lock);
 248                synchronize_srcu(&fd->pq_srcu);
 249                /* at this point there can be no more new requests */
 250                if (pq->handler)
 251                        hfi1_mmu_rb_unregister(pq->handler);
 252                iowait_sdma_drain(&pq->busy);
 253                /* Wait until all requests have been freed. */
 254                wait_event_interruptible(
 255                        pq->wait,
 256                        !atomic_read(&pq->n_reqs));
 257                kfree(pq->reqs);
 258                bitmap_free(pq->req_in_use);
 259                kmem_cache_destroy(pq->txreq_cache);
 260                flush_pq_iowait(pq);
 261                kfree(pq);
 262        } else {
 263                spin_unlock(&fd->pq_rcu_lock);
 264        }
 265        if (fd->cq) {
 266                vfree(fd->cq->comps);
 267                kfree(fd->cq);
 268                fd->cq = NULL;
 269        }
 270        return 0;
 271}
 272
 273static u8 dlid_to_selector(u16 dlid)
 274{
 275        static u8 mapping[256];
 276        static int initialized;
 277        static u8 next;
 278        int hash;
 279
 280        if (!initialized) {
 281                memset(mapping, 0xFF, 256);
 282                initialized = 1;
 283        }
 284
 285        hash = ((dlid >> 8) ^ dlid) & 0xFF;
 286        if (mapping[hash] == 0xFF) {
 287                mapping[hash] = next;
 288                next = (next + 1) & 0x7F;
 289        }
 290
 291        return mapping[hash];
 292}
 293
  294/**
  295 * hfi1_user_sdma_process_request() - Process and start a user sdma request
  296 * @fd: valid file descriptor
  297 * @iovec: array of io vectors to process
  298 * @dim: overall iovec array size
  299 * @count: number of io vector array entries processed
 *
 * The first vector must hold a struct sdma_req_info immediately followed by
 * the packet header template.  The following vectors are the data buffers
 * and, for EXPECTED requests, one final vector with the TID list.  The
 * request slot is selected by info.comp_idx and claimed through the
 * pq->req_in_use bitmap.  The function then loops until all info.npkts
 * packets have been submitted to the SDMA engine; it does not wait for the
 * send completions.
 *
 * Return: 0 on success, with @count incremented by the number of vectors
 * consumed.  -EINVAL for a malformed request or header, -EFAULT if the info
 * or header cannot be copied from user space, -EBADSLT if the completion
 * slot is already in use, -ECOMM if no running SDMA engine is available, or
 * an error propagated from pin_vector_pages(), memdup_user() or
 * user_sdma_send_pkts().
  300 */
  301int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
  302                                   struct iovec *iovec, unsigned long dim,
  303                                   unsigned long *count)
  304{
  305        int ret = 0, i;
  306        struct hfi1_ctxtdata *uctxt = fd->uctxt;
  307        struct hfi1_user_sdma_pkt_q *pq =
  308                srcu_dereference(fd->pq, &fd->pq_srcu);
  309        struct hfi1_user_sdma_comp_q *cq = fd->cq;
  310        struct hfi1_devdata *dd = pq->dd;
  311        unsigned long idx = 0;
  312        u8 pcount = initial_pkt_count;
  313        struct sdma_req_info info;
  314        struct user_sdma_request *req;
  315        u8 opcode, sc, vl;
  316        u16 pkey;
  317        u32 slid;
  318        u16 dlid;
  319        u32 selector;
  320
             /*
              * No request slot has been claimed yet, so the validation
              * failures below return directly without any cleanup.
              */
  321        if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
  322                hfi1_cdbg(
  323                   SDMA,
  324                   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
  325                   dd->unit, uctxt->ctxt, fd->subctxt,
  326                   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
  327                return -EINVAL;
  328        }
  329        ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
  330        if (ret) {
  331                hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
  332                          dd->unit, uctxt->ctxt, fd->subctxt, ret);
  333                return -EFAULT;
  334        }
  335
  336        trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
  337                                     (u16 *)&info);
  338        if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
  339                hfi1_cdbg(SDMA,
  340                          "[%u:%u:%u:%u] Invalid comp index",
  341                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
  342                return -EINVAL;
  343        }
  344
  345        /*
  346         * Sanity check the header io vector count.  Need at least 1 vector
  347         * (header) and cannot be larger than the actual io vector count.
  348         */
  349        if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
  350                hfi1_cdbg(SDMA,
  351                          "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
  352                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
  353                          req_iovcnt(info.ctrl), dim);
  354                return -EINVAL;
  355        }
  356
  357        if (!info.fragsize) {
  358                hfi1_cdbg(SDMA,
  359                          "[%u:%u:%u:%u] Request does not specify fragsize",
  360                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
  361                return -EINVAL;
  362        }
  363
  364        /* Try to claim the request. */
  365        if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
  366                hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
  367                          dd->unit, uctxt->ctxt, fd->subctxt,
  368                          info.comp_idx);
  369                return -EBADSLT;
  370        }
  371        /*
  372         * All safety checks have been done and this request has been claimed.
  373         */
  374        trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
  375                                             info.comp_idx);
  376        req = pq->reqs + info.comp_idx;
  377        req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
  378        req->data_len  = 0;
  379        req->pq = pq;
  380        req->cq = cq;
  381        req->ahg_idx = -1;
  382        req->iov_idx = 0;
  383        req->sent = 0;
  384        req->seqnum = 0;
  385        req->seqcomp = 0;
  386        req->seqsubmitted = 0;
  387        req->tids = NULL;
  388        req->has_error = 0;
  389        INIT_LIST_HEAD(&req->txps);
  390
  391        memcpy(&req->info, &info, sizeof(info));
  392
  393        /* The request is initialized, count it */
  394        atomic_inc(&pq->n_reqs);
  395
             /* From here on every failure must go through free_req. */
  396        if (req_opcode(info.ctrl) == EXPECTED) {
  397                /* expected must have a TID info and at least one data vector */
  398                if (req->data_iovs < 2) {
  399                        SDMA_DBG(req,
  400                                 "Not enough vectors for expected request");
  401                        ret = -EINVAL;
  402                        goto free_req;
  403                }
                     /* The trailing TID vector is not a data vector. */
  404                req->data_iovs--;
  405        }
  406
             /*
              * Note that a request with npkts == 0 is rejected here as well,
              * even though the debug message only mentions the vector count.
              */
  407        if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
  408                SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
  409                         MAX_VECTORS_PER_REQ);
  410                ret = -EINVAL;
  411                goto free_req;
  412        }
  413        /* Copy the header from the user buffer */
  414        ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
  415                             sizeof(req->hdr));
  416        if (ret) {
  417                SDMA_DBG(req, "Failed to copy header template (%d)", ret);
  418                ret = -EFAULT;
  419                goto free_req;
  420        }
  421
  422        /* If Static rate control is not enabled, sanitize the header. */
  423        if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
  424                req->hdr.pbc[2] = 0;
  425
  426        /* Validate the opcode. Do not trust packets from user space blindly. */
  427        opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
  428        if ((opcode & USER_OPCODE_CHECK_MASK) !=
  429             USER_OPCODE_CHECK_VAL) {
  430                SDMA_DBG(req, "Invalid opcode (%d)", opcode);
  431                ret = -EINVAL;
  432                goto free_req;
  433        }
  434        /*
  435         * Validate the vl. Do not trust packets from user space blindly.
  436         * VL comes from PBC, SC comes from LRH, and the VL needs to
  437         * match the SC look up.
  438         */
  439        vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
  440        sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
  441              (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
  442        if (vl >= dd->pport->vls_operational ||
  443            vl != sc_to_vlt(dd, sc)) {
  444                SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
  445                ret = -EINVAL;
  446                goto free_req;
  447        }
  448
  449        /* Checking P_KEY for requests from user-space */
  450        pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
  451        slid = be16_to_cpu(req->hdr.lrh[3]);
  452        if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
  453                ret = -EINVAL;
  454                goto free_req;
  455        }
  456
  457        /*
  458         * Also should check the BTH.lnh. If it says the next header is GRH then
  459         * the RXE parsing will be off and will land in the middle of the KDETH
  460         * or miss it entirely.
  461         */
  462        if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
  463                SDMA_DBG(req, "User tried to pass in a GRH");
  464                ret = -EINVAL;
  465                goto free_req;
  466        }
  467
  468        req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
  469        /*
  470         * Calculate the initial TID offset based on the values of
  471         * KDETH.OFFSET and KDETH.OM that are passed in.
  472         */
  473        req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
  474                (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
  475                 KDETH_OM_LARGE : KDETH_OM_SMALL);
  476        trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
  477                                               info.comp_idx, req->tidoffset);
             /* Header vector consumed; idx now points at the first data vector. */
  478        idx++;
  479
  480        /* Save all the IO vector structures */
  481        for (i = 0; i < req->data_iovs; i++) {
  482                req->iovs[i].offset = 0;
  483                INIT_LIST_HEAD(&req->iovs[i].list);
  484                memcpy(&req->iovs[i].iov,
  485                       iovec + idx++,
  486                       sizeof(req->iovs[i].iov));
  487                ret = pin_vector_pages(req, &req->iovs[i]);
  488                if (ret) {
                             /*
                              * Shrink data_iovs to the vectors that were
                              * pinned before going to the cleanup path.
                              */
  489                        req->data_iovs = i;
  490                        goto free_req;
  491                }
  492                req->data_len += req->iovs[i].iov.iov_len;
  493        }
  494        trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
  495                                         info.comp_idx, req->data_len);
             /* Never submit more packets per call than the request contains. */
  496        if (pcount > req->info.npkts)
  497                pcount = req->info.npkts;
  498        /*
  499         * Copy any TID info
  500         * User space will provide the TID info only when the
  501         * request type is EXPECTED. This is true even if there is
  502         * only one packet in the request and the header is already
  503         * setup. The reason for the singular TID case is that the
  504         * driver needs to perform safety checks.
  505         */
  506        if (req_opcode(req->info.ctrl) == EXPECTED) {
  507                u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
  508                u32 *tmp;
  509
  510                if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
  511                        ret = -EINVAL;
  512                        goto free_req;
  513                }
  514
  515                /*
  516                 * We have to copy all of the tids because they may vary
  517                 * in size and, therefore, the TID count might not be
  518                 * equal to the pkt count. However, there is no way to
  519                 * tell at this point.
  520                 */
  521                tmp = memdup_user(iovec[idx].iov_base,
  522                                  ntids * sizeof(*req->tids));
  523                if (IS_ERR(tmp)) {
  524                        ret = PTR_ERR(tmp);
  525                        SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
  526                                 ntids, ret);
  527                        goto free_req;
  528                }
  529                req->tids = tmp;
  530                req->n_tids = ntids;
  531                req->tididx = 0;
  532                idx++;
  533        }
  534
             /*
              * Pick the SDMA engine: the selector combines the DLID-derived
              * value with the context and subcontext numbers, and the VL is
              * the one validated against the SC above.
              */
  535        dlid = be16_to_cpu(req->hdr.lrh[1]);
  536        selector = dlid_to_selector(dlid);
  537        selector += uctxt->ctxt + fd->subctxt;
  538        req->sde = sdma_select_user_engine(dd, selector, vl);
  539
  540        if (!req->sde || !sdma_running(req->sde)) {
  541                ret = -ECOMM;
  542                goto free_req;
  543        }
  544
  545        /* We don't need an AHG entry if the request contains only one packet */
  546        if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
  547                req->ahg_idx = sdma_ahg_alloc(req->sde);
  548
  549        set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
  550        pq->state = SDMA_PKT_Q_ACTIVE;
  551
  552        /*
  553         * This is a somewhat blocking send implementation.
  554         * The driver will block the caller until all packets of the
  555         * request have been submitted to the SDMA engine. However, it
  556         * will not wait for send completions.
  557         */
  558        while (req->seqsubmitted != req->info.npkts) {
  559                ret = user_sdma_send_pkts(req, pcount);
  560                if (ret < 0) {
  561                        int we_ret;
  562
                             /*
                              * Only -EBUSY is retried: sleep until the queue
                              * state is SDMA_PKT_Q_ACTIVE again or the timeout
                              * expires.  If the wait times out or is
                              * interrupted (we_ret <= 0), take the queue off
                              * the wait list before trying again.
                              */
  563                        if (ret != -EBUSY)
  564                                goto free_req;
  565                        we_ret = wait_event_interruptible_timeout(
  566                                pq->busy.wait_dma,
  567                                pq->state == SDMA_PKT_Q_ACTIVE,
  568                                msecs_to_jiffies(
  569                                        SDMA_IOWAIT_TIMEOUT));
  570                        trace_hfi1_usdma_we(pq, we_ret);
  571                        if (we_ret <= 0)
  572                                flush_pq_iowait(pq);
  573                }
  574        }
             /* idx counts the header, data and (if present) TID vectors used. */
  575        *count += idx;
  576        return 0;
  577free_req:
  578        /*
  579         * If the submitted seqsubmitted == npkts, the completion routine
  580         * controls the final state.  If seqsubmitted < npkts, wait for any
  581         * outstanding packets to finish before cleaning up.
  582         */
  583        if (req->seqsubmitted < req->info.npkts) {
  584                if (req->seqsubmitted)
  585                        wait_event(pq->busy.wait_dma,
  586                                   (req->seqcomp == req->seqsubmitted - 1));
  587                user_sdma_free_request(req, true);
  588                pq_update(pq);
  589                set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
  590        }
  591        return ret;
  592}
 593
 594static inline u32 compute_data_length(struct user_sdma_request *req,
 595                                      struct user_sdma_txreq *tx)
 596{
 597        /*
 598         * Determine the proper size of the packet data.
 599         * The size of the data of the first packet is in the header
 600         * template. However, it includes the header and ICRC, which need
 601         * to be subtracted.
 602         * The minimum representable packet data length in a header is 4 bytes,
 603         * therefore, when the data length request is less than 4 bytes, there's
 604         * only one packet, and the packet data length is equal to that of the
 605         * request data length.
 606         * The size of the remaining packets is the minimum of the frag
 607         * size (MTU) or remaining data in the request.
 608         */
 609        u32 len;
 610
 611        if (!req->seqnum) {
 612                if (req->data_len < sizeof(u32))
 613                        len = req->data_len;
 614                else
 615                        len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 616                               (sizeof(tx->hdr) - 4));
 617        } else if (req_opcode(req->info.ctrl) == EXPECTED) {
 618                u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 619                        PAGE_SIZE;
 620                /*
 621                 * Get the data length based on the remaining space in the
 622                 * TID pair.
 623                 */
 624                len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 625                /* If we've filled up the TID pair, move to the next one. */
 626                if (unlikely(!len) && ++req->tididx < req->n_tids &&
 627                    req->tids[req->tididx]) {
 628                        tidlen = EXP_TID_GET(req->tids[req->tididx],
 629                                             LEN) * PAGE_SIZE;
 630                        req->tidoffset = 0;
 631                        len = min_t(u32, tidlen, req->info.fragsize);
 632                }
 633                /*
 634                 * Since the TID pairs map entire pages, make sure that we
 635                 * are not going to try to send more data that we have
 636                 * remaining.
 637                 */
 638                len = min(len, req->data_len - req->sent);
 639        } else {
 640                len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 641        }
 642        trace_hfi1_sdma_user_compute_length(req->pq->dd,
 643                                            req->pq->ctxt,
 644                                            req->pq->subctxt,
 645                                            req->info.comp_idx,
 646                                            len);
 647        return len;
 648}
 649
 650static inline u32 pad_len(u32 len)
 651{
 652        if (len & (sizeof(u32) - 1))
 653                len += sizeof(u32) - (len & (sizeof(u32) - 1));
 654        return len;
 655}
 656
 657static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 658{
 659        /* (Size of complete header - size of PBC) + 4B ICRC + data length */
 660        return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 661}
 662
 663static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 664                               struct user_sdma_txreq *tx,
 665                               u32 datalen)
 666{
 667        int ret;
 668        u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 669        u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 670        struct hfi1_user_sdma_pkt_q *pq = req->pq;
 671
 672        /*
 673         * Copy the request header into the tx header
 674         * because the HW needs a cacheline-aligned
 675         * address.
 676         * This copy can be optimized out if the hdr
 677         * member of user_sdma_request were also
 678         * cacheline aligned.
 679         */
 680        memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 681        if (PBC2LRH(pbclen) != lrhlen) {
 682                pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 683                tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 684        }
 685        ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 686        if (ret)
 687                return ret;
 688        ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 689                              sizeof(tx->hdr) + datalen, req->ahg_idx,
 690                              0, NULL, 0, user_sdma_txreq_cb);
 691        if (ret)
 692                return ret;
 693        ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 694        if (ret)
 695                sdma_txclean(pq->dd, &tx->txreq);
 696        return ret;
 697}
 698
 699static int user_sdma_txadd(struct user_sdma_request *req,
 700                           struct user_sdma_txreq *tx,
 701                           struct user_sdma_iovec *iovec, u32 datalen,
 702                           u32 *queued_ptr, u32 *data_sent_ptr,
 703                           u64 *iov_offset_ptr)
 704{
 705        int ret;
 706        unsigned int pageidx, len;
 707        unsigned long base, offset;
 708        u64 iov_offset = *iov_offset_ptr;
 709        u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
 710        struct hfi1_user_sdma_pkt_q *pq = req->pq;
 711
 712        base = (unsigned long)iovec->iov.iov_base;
 713        offset = offset_in_page(base + iovec->offset + iov_offset);
 714        pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
 715                   PAGE_SHIFT);
 716        len = offset + req->info.fragsize > PAGE_SIZE ?
 717                PAGE_SIZE - offset : req->info.fragsize;
 718        len = min((datalen - queued), len);
 719        ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
 720                              offset, len);
 721        if (ret) {
 722                SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
 723                return ret;
 724        }
 725        iov_offset += len;
 726        queued += len;
 727        data_sent += len;
 728        if (unlikely(queued < datalen && pageidx == iovec->npages &&
 729                     req->iov_idx < req->data_iovs - 1)) {
 730                iovec->offset += iov_offset;
 731                iovec = &req->iovs[++req->iov_idx];
 732                iov_offset = 0;
 733        }
 734
 735        *queued_ptr = queued;
 736        *data_sent_ptr = data_sent;
 737        *iov_offset_ptr = iov_offset;
 738        return ret;
 739}
 740
 741static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 742{
 743        int ret = 0;
 744        u16 count;
 745        unsigned npkts = 0;
 746        struct user_sdma_txreq *tx = NULL;
 747        struct hfi1_user_sdma_pkt_q *pq = NULL;
 748        struct user_sdma_iovec *iovec = NULL;
 749
 750        if (!req->pq)
 751                return -EINVAL;
 752
 753        pq = req->pq;
 754
 755        /* If tx completion has reported an error, we are done. */
 756        if (READ_ONCE(req->has_error))
 757                return -EFAULT;
 758
 759        /*
 760         * Check if we might have sent the entire request already
 761         */
 762        if (unlikely(req->seqnum == req->info.npkts)) {
 763                if (!list_empty(&req->txps))
 764                        goto dosend;
 765                return ret;
 766        }
 767
 768        if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 769                maxpkts = req->info.npkts - req->seqnum;
 770
 771        while (npkts < maxpkts) {
 772                u32 datalen = 0, queued = 0, data_sent = 0;
 773                u64 iov_offset = 0;
 774
 775                /*
 776                 * Check whether any of the completions have come back
 777                 * with errors. If so, we are not going to process any
 778                 * more packets from this request.
 779                 */
 780                if (READ_ONCE(req->has_error))
 781                        return -EFAULT;
 782
 783                tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 784                if (!tx)
 785                        return -ENOMEM;
 786
 787                tx->flags = 0;
 788                tx->req = req;
 789                INIT_LIST_HEAD(&tx->list);
 790
 791                /*
 792                 * For the last packet set the ACK request
 793                 * and disable header suppression.
 794                 */
 795                if (req->seqnum == req->info.npkts - 1)
 796                        tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 797                                      TXREQ_FLAGS_REQ_DISABLE_SH);
 798
 799                /*
 800                 * Calculate the payload size - this is min of the fragment
 801                 * (MTU) size or the remaining bytes in the request but only
 802                 * if we have payload data.
 803                 */
 804                if (req->data_len) {
 805                        iovec = &req->iovs[req->iov_idx];
 806                        if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 807                                if (++req->iov_idx == req->data_iovs) {
 808                                        ret = -EFAULT;
 809                                        goto free_tx;
 810                                }
 811                                iovec = &req->iovs[req->iov_idx];
 812                                WARN_ON(iovec->offset);
 813                        }
 814
 815                        datalen = compute_data_length(req, tx);
 816
 817                        /*
 818                         * Disable header suppression for the payload <= 8DWS.
 819                         * If there is an uncorrectable error in the receive
 820                         * data FIFO when the received payload size is less than
 821                         * or equal to 8DWS then the RxDmaDataFifoRdUncErr is
 822                         * not reported.There is set RHF.EccErr if the header
 823                         * is not suppressed.
 824                         */
 825                        if (!datalen) {
 826                                SDMA_DBG(req,
 827                                         "Request has data but pkt len is 0");
 828                                ret = -EFAULT;
 829                                goto free_tx;
 830                        } else if (datalen <= 32) {
 831                                tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 832                        }
 833                }
 834
 835                if (req->ahg_idx >= 0) {
 836                        if (!req->seqnum) {
 837                                ret = user_sdma_txadd_ahg(req, tx, datalen);
 838                                if (ret)
 839                                        goto free_tx;
 840                        } else {
 841                                int changes;
 842
 843                                changes = set_txreq_header_ahg(req, tx,
 844                                                               datalen);
 845                                if (changes < 0) {
 846                                        ret = changes;
 847                                        goto free_tx;
 848                                }
 849                        }
 850                } else {
 851                        ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 852                                          datalen, user_sdma_txreq_cb);
 853                        if (ret)
 854                                goto free_tx;
 855                        /*
 856                         * Modify the header for this packet. This only needs
 857                         * to be done if we are not going to use AHG. Otherwise,
 858                         * the HW will do it based on the changes we gave it
 859                         * during sdma_txinit_ahg().
 860                         */
 861                        ret = set_txreq_header(req, tx, datalen);
 862                        if (ret)
 863                                goto free_txreq;
 864                }
 865
 866                /*
 867                 * If the request contains any data vectors, add up to
 868                 * fragsize bytes to the descriptor.
 869                 */
 870                while (queued < datalen &&
 871                       (req->sent + data_sent) < req->data_len) {
 872                        ret = user_sdma_txadd(req, tx, iovec, datalen,
 873                                              &queued, &data_sent, &iov_offset);
 874                        if (ret)
 875                                goto free_txreq;
 876                }
 877                /*
 878                 * The txreq was submitted successfully so we can update
 879                 * the counters.
 880                 */
 881                req->koffset += datalen;
 882                if (req_opcode(req->info.ctrl) == EXPECTED)
 883                        req->tidoffset += datalen;
 884                req->sent += data_sent;
 885                if (req->data_len)
 886                        iovec->offset += iov_offset;
 887                list_add_tail(&tx->txreq.list, &req->txps);
 888                /*
 889                 * It is important to increment this here as it is used to
 890                 * generate the BTH.PSN and, therefore, can't be bulk-updated
 891                 * outside of the loop.
 892                 */
 893                tx->seqnum = req->seqnum++;
 894                npkts++;
 895        }
 896dosend:
 897        ret = sdma_send_txlist(req->sde,
 898                               iowait_get_ib_work(&pq->busy),
 899                               &req->txps, &count);
 900        req->seqsubmitted += count;
 901        if (req->seqsubmitted == req->info.npkts) {
 902                /*
 903                 * The txreq has already been submitted to the HW queue
 904                 * so we can free the AHG entry now. Corruption will not
 905                 * happen due to the sequential manner in which
 906                 * descriptors are processed.
 907                 */
 908                if (req->ahg_idx >= 0)
 909                        sdma_ahg_free(req->sde, req->ahg_idx);
 910        }
 911        return ret;
 912
 913free_txreq:
 914        sdma_txclean(pq->dd, &tx->txreq);
 915free_tx:
 916        kmem_cache_free(pq->txreq_cache, tx);
 917        return ret;
 918}
 919
 920static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 921{
 922        struct evict_data evict_data;
 923
 924        evict_data.cleared = 0;
 925        evict_data.target = npages;
 926        hfi1_mmu_rb_evict(pq->handler, &evict_data);
 927        return evict_data.cleared;
 928}
 929
 930static int pin_sdma_pages(struct user_sdma_request *req,
 931                          struct user_sdma_iovec *iovec,
 932                          struct sdma_mmu_node *node,
 933                          int npages)
 934{
 935        int pinned, cleared;
 936        struct page **pages;
 937        struct hfi1_user_sdma_pkt_q *pq = req->pq;
 938
 939        pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 940        if (!pages)
 941                return -ENOMEM;
 942        memcpy(pages, node->pages, node->npages * sizeof(*pages));
 943
 944        npages -= node->npages;
 945retry:
 946        if (!hfi1_can_pin_pages(pq->dd, current->mm,
 947                                atomic_read(&pq->n_locked), npages)) {
 948                cleared = sdma_cache_evict(pq, npages);
 949                if (cleared >= npages)
 950                        goto retry;
 951        }
 952        pinned = hfi1_acquire_user_pages(current->mm,
 953                                         ((unsigned long)iovec->iov.iov_base +
 954                                         (node->npages * PAGE_SIZE)), npages, 0,
 955                                         pages + node->npages);
 956        if (pinned < 0) {
 957                kfree(pages);
 958                return pinned;
 959        }
 960        if (pinned != npages) {
 961                unpin_vector_pages(current->mm, pages, node->npages, pinned);
 962                return -EFAULT;
 963        }
 964        kfree(node->pages);
 965        node->rb.len = iovec->iov.iov_len;
 966        node->pages = pages;
 967        atomic_add(pinned, &pq->n_locked);
 968        return pinned;
 969}
 970
 971static void unpin_sdma_pages(struct sdma_mmu_node *node)
 972{
 973        if (node->npages) {
 974                unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
 975                                   node->npages);
 976                atomic_sub(node->npages, &node->pq->n_locked);
 977        }
 978}
 979
 980static int pin_vector_pages(struct user_sdma_request *req,
 981                            struct user_sdma_iovec *iovec)
 982{
 983        int ret = 0, pinned, npages;
 984        struct hfi1_user_sdma_pkt_q *pq = req->pq;
 985        struct sdma_mmu_node *node = NULL;
 986        struct mmu_rb_node *rb_node;
 987        struct iovec *iov;
 988        bool extracted;
 989
 990        extracted =
 991                hfi1_mmu_rb_remove_unless_exact(pq->handler,
 992                                                (unsigned long)
 993                                                iovec->iov.iov_base,
 994                                                iovec->iov.iov_len, &rb_node);
 995        if (rb_node) {
 996                node = container_of(rb_node, struct sdma_mmu_node, rb);
 997                if (!extracted) {
 998                        atomic_inc(&node->refcount);
 999                        iovec->pages = node->pages;
1000                        iovec->npages = node->npages;

1001                        iovec->node = node;
1002                        return 0;
1003                }
1004        }
1005
1006        if (!node) {
1007                node = kzalloc(sizeof(*node), GFP_KERNEL);
1008                if (!node)
1009                        return -ENOMEM;
1010
1011                node->rb.addr = (unsigned long)iovec->iov.iov_base;
1012                node->pq = pq;
1013                atomic_set(&node->refcount, 0);
1014        }
1015
1016        iov = &iovec->iov;
1017        npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
1018        if (node->npages < npages) {
1019                pinned = pin_sdma_pages(req, iovec, node, npages);
1020                if (pinned < 0) {
1021                        ret = pinned;
1022                        goto bail;
1023                }
1024                node->npages += pinned;
1025                npages = node->npages;
1026        }
1027        iovec->pages = node->pages;
1028        iovec->npages = npages;
1029        iovec->node = node;
1030
1031        ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
1032        if (ret) {
1033                iovec->node = NULL;
1034                goto bail;
1035        }
1036        return 0;
1037bail:
1038        unpin_sdma_pages(node);
1039        kfree(node);
1040        return ret;
1041}
1042
1043static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
1044                               unsigned start, unsigned npages)
1045{
1046        hfi1_release_user_pages(mm, pages + start, npages, false);
1047        kfree(pages);
1048}
1049
1050static int check_header_template(struct user_sdma_request *req,
1051                                 struct hfi1_pkt_header *hdr, u32 lrhlen,
1052                                 u32 datalen)
1053{
1054        /*
1055         * Perform safety checks for any type of packet:
1056         *    - transfer size is multiple of 64bytes
1057         *    - packet length is multiple of 4 bytes
1058         *    - packet length is not larger than MTU size
1059         *
1060         * These checks are only done for the first packet of the
1061         * transfer since the header is "given" to us by user space.
1062         * For the remainder of the packets we compute the values.
1063         */
1064        if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
1065            lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1066                return -EINVAL;
1067
1068        if (req_opcode(req->info.ctrl) == EXPECTED) {
1069                /*
1070                 * The header is checked only on the first packet. Furthermore,
1071                 * we ensure that at least one TID entry is copied when the
1072                 * request is submitted. Therefore, we don't have to verify that
1073                 * tididx points to something sane.
1074                 */
1075                u32 tidval = req->tids[req->tididx],
1076                        tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1077                        tididx = EXP_TID_GET(tidval, IDX),
1078                        tidctrl = EXP_TID_GET(tidval, CTRL),
1079                        tidoff;
1080                __le32 kval = hdr->kdeth.ver_tid_offset;
1081
1082                tidoff = KDETH_GET(kval, OFFSET) *
1083                          (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1084                           KDETH_OM_LARGE : KDETH_OM_SMALL);
1085                /*
1086                 * Expected receive packets have the following
1087                 * additional checks:
1088                 *     - offset is not larger than the TID size
1089                 *     - TIDCtrl values match between header and TID array
1090                 *     - TID indexes match between header and TID array
1091                 */
1092                if ((tidoff + datalen > tidlen) ||
1093                    KDETH_GET(kval, TIDCTRL) != tidctrl ||
1094                    KDETH_GET(kval, TID) != tididx)
1095                        return -EINVAL;
1096        }
1097        return 0;
1098}
1099
1100/*
1101 * Correctly set the BTH.PSN field based on type of
1102 * transfer - eager packets can just increment the PSN but
1103 * expected packets encode generation and sequence in the
1104 * BTH.PSN field so just incrementing will result in errors.
1105 */
1106static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1107{
1108        u32 val = be32_to_cpu(bthpsn),
1109                mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1110                        0xffffffull),
1111                psn = val & mask;
1112        if (expct)
1113                psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
1114                        ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
1115        else
1116                psn = psn + frags;
1117        return psn & mask;
1118}
1119
1120static int set_txreq_header(struct user_sdma_request *req,
1121                            struct user_sdma_txreq *tx, u32 datalen)
1122{
1123        struct hfi1_user_sdma_pkt_q *pq = req->pq;
1124        struct hfi1_pkt_header *hdr = &tx->hdr;
1125        u8 omfactor; /* KDETH.OM */
1126        u16 pbclen;
1127        int ret;
1128        u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1129
1130        /* Copy the header template to the request before modification */
1131        memcpy(hdr, &req->hdr, sizeof(*hdr));
1132
1133        /*
1134         * Check if the PBC and LRH length are mismatched. If so
1135         * adjust both in the header.
1136         */
1137        pbclen = le16_to_cpu(hdr->pbc[0]);
1138        if (PBC2LRH(pbclen) != lrhlen) {
1139                pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1140                hdr->pbc[0] = cpu_to_le16(pbclen);
1141                hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1142                /*
1143                 * Third packet
1144                 * This is the first packet in the sequence that has
1145                 * a "static" size that can be used for the rest of
1146                 * the packets (besides the last one).
1147                 */
1148                if (unlikely(req->seqnum == 2)) {
1149                        /*
1150                         * From this point on the lengths in both the
1151                         * PBC and LRH are the same until the last
1152                         * packet.
1153                         * Adjust the template so we don't have to update
1154                         * every packet
1155                         */
1156                        req->hdr.pbc[0] = hdr->pbc[0];
1157                        req->hdr.lrh[2] = hdr->lrh[2];
1158                }
1159        }
1160        /*
1161         * We only have to modify the header if this is not the
1162         * first packet in the request. Otherwise, we use the
1163         * header given to us.
1164         */
1165        if (unlikely(!req->seqnum)) {
1166                ret = check_header_template(req, hdr, lrhlen, datalen);
1167                if (ret)
1168                        return ret;
1169                goto done;
1170        }
1171
1172        hdr->bth[2] = cpu_to_be32(
1173                set_pkt_bth_psn(hdr->bth[2],
1174                                (req_opcode(req->info.ctrl) == EXPECTED),
1175                                req->seqnum));
1176
1177        /* Set ACK request on last packet */
1178        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1179                hdr->bth[2] |= cpu_to_be32(1UL << 31);
1180
1181        /* Set the new offset */
1182        hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1183        /* Expected packets have to fill in the new TID information */
1184        if (req_opcode(req->info.ctrl) == EXPECTED) {
1185                tidval = req->tids[req->tididx];
1186                /*
1187                 * If the offset puts us at the end of the current TID,
1188                 * advance everything.
1189                 */
1190                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1191                                         PAGE_SIZE)) {
1192                        req->tidoffset = 0;
1193                        /*
1194                         * Since we don't copy all the TIDs, all at once,
1195                         * we have to check again.
1196                         */
1197                        if (++req->tididx > req->n_tids - 1 ||
1198                            !req->tids[req->tididx]) {
1199                                return -EINVAL;
1200                        }
1201                        tidval = req->tids[req->tididx];
1202                }
1203                omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1204                        KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
1205                        KDETH_OM_SMALL_SHIFT;
1206                /* Set KDETH.TIDCtrl based on value for this TID. */
1207                KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1208                          EXP_TID_GET(tidval, CTRL));
1209                /* Set KDETH.TID based on value for this TID */
1210                KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1211                          EXP_TID_GET(tidval, IDX));
1212                /* Clear KDETH.SH when DISABLE_SH flag is set */
1213                if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1214                        KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1215                /*
1216                 * Set the KDETH.OFFSET and KDETH.OM based on size of
1217                 * transfer.
1218                 */
1219                trace_hfi1_sdma_user_tid_info(
1220                        pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1221                        req->tidoffset, req->tidoffset >> omfactor,
1222                        omfactor != KDETH_OM_SMALL_SHIFT);
1223                KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1224                          req->tidoffset >> omfactor);
1225                KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1226                          omfactor != KDETH_OM_SMALL_SHIFT);
1227        }
1228done:
1229        trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1230                                    req->info.comp_idx, hdr, tidval);
1231        return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1232}
1233
1234static int set_txreq_header_ahg(struct user_sdma_request *req,
1235                                struct user_sdma_txreq *tx, u32 datalen)
1236{
1237        u32 ahg[AHG_KDETH_ARRAY_SIZE];
1238        int idx = 0;
1239        u8 omfactor; /* KDETH.OM */
1240        struct hfi1_user_sdma_pkt_q *pq = req->pq;
1241        struct hfi1_pkt_header *hdr = &req->hdr;
1242        u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1243        u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1244        size_t array_size = ARRAY_SIZE(ahg);
1245
1246        if (PBC2LRH(pbclen) != lrhlen) {
1247                /* PBC.PbcLengthDWs */
1248                idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1249                                     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1250                if (idx < 0)
1251                        return idx;
1252                /* LRH.PktLen (we need the full 16 bits due to byte swap) */
1253                idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1254                                     (__force u16)cpu_to_be16(lrhlen >> 2));
1255                if (idx < 0)
1256                        return idx;
1257        }
1258
1259        /*
1260         * Do the common updates
1261         */
1262        /* BTH.PSN and BTH.A */
1263        val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1264                (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1265        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1266                val32 |= 1UL << 31;
1267        idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1268                             (__force u16)cpu_to_be16(val32 >> 16));
1269        if (idx < 0)
1270                return idx;
1271        idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1272                             (__force u16)cpu_to_be16(val32 & 0xffff));
1273        if (idx < 0)
1274                return idx;
1275        /* KDETH.Offset */
1276        idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1277                             (__force u16)cpu_to_le16(req->koffset & 0xffff));
1278        if (idx < 0)
1279                return idx;
1280        idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1281                             (__force u16)cpu_to_le16(req->koffset >> 16));
1282        if (idx < 0)
1283                return idx;
1284        if (req_opcode(req->info.ctrl) == EXPECTED) {
1285                __le16 val;
1286
1287                tidval = req->tids[req->tididx];
1288
1289                /*
1290                 * If the offset puts us at the end of the current TID,
1291                 * advance everything.
1292                 */
1293                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1294                                         PAGE_SIZE)) {
1295                        req->tidoffset = 0;
1296                        /*
1297                         * Since we don't copy all the TIDs, all at once,
1298                         * we have to check again.
1299                         */
1300                        if (++req->tididx > req->n_tids - 1 ||
1301                            !req->tids[req->tididx])
1302                                return -EINVAL;
1303                        tidval = req->tids[req->tididx];
1304                }
1305                omfactor = ((EXP_TID_GET(tidval, LEN) *
1306                                  PAGE_SIZE) >=
1307                                 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1308                                 KDETH_OM_SMALL_SHIFT;
1309                /* KDETH.OM and KDETH.OFFSET (TID) */
1310                idx = ahg_header_set(
1311                                ahg, idx, array_size, 7, 0, 16,
1312                                ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1313                                ((req->tidoffset >> omfactor)
1314                                & 0x7fff)));
1315                if (idx < 0)
1316                        return idx;
1317                /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1318                val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1319                                   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1320
1321                if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1322                        val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1323                                                      INTR) <<
1324                                            AHG_KDETH_INTR_SHIFT));
1325                } else {
1326                        val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1327                               cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1328                               cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1329                                                      INTR) <<
1330                                             AHG_KDETH_INTR_SHIFT));
1331                }
1332
1333                idx = ahg_header_set(ahg, idx, array_size,
1334                                     7, 16, 14, (__force u16)val);
1335                if (idx < 0)
1336                        return idx;
1337        }
1338
1339        trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1340                                        req->info.comp_idx, req->sde->this_idx,
1341                                        req->ahg_idx, ahg, idx, tidval);
1342        sdma_txinit_ahg(&tx->txreq,
1343                        SDMA_TXREQ_F_USE_AHG,
1344                        datalen, req->ahg_idx, idx,
1345                        ahg, sizeof(req->hdr),
1346                        user_sdma_txreq_cb);
1347
1348        return idx;
1349}
1350
1351/**
1352 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1353 * @txreq: valid sdma tx request
1354 * @status: success/failure of request
1355 *
1356 * Called when the SDMA progress state machine gets notification that
1357 * the SDMA descriptors for this tx request have been processed by the
1358 * DMA engine. Called in interrupt context.
1359 * Only do work on completed sequences.
1360 */
1361static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1362{
1363        struct user_sdma_txreq *tx =
1364                container_of(txreq, struct user_sdma_txreq, txreq);
1365        struct user_sdma_request *req;
1366        struct hfi1_user_sdma_pkt_q *pq;
1367        struct hfi1_user_sdma_comp_q *cq;
1368        enum hfi1_sdma_comp_state state = COMPLETE;
1369
1370        if (!tx->req)
1371                return;
1372
1373        req = tx->req;
1374        pq = req->pq;
1375        cq = req->cq;
1376
1377        if (status != SDMA_TXREQ_S_OK) {
1378                SDMA_DBG(req, "SDMA completion with error %d",
1379                         status);
1380                WRITE_ONCE(req->has_error, 1);
1381                state = ERROR;
1382        }
1383
1384        req->seqcomp = tx->seqnum;
1385        kmem_cache_free(pq->txreq_cache, tx);
1386
1387        /* sequence isn't complete?  We are done */
1388        if (req->seqcomp != req->info.npkts - 1)
1389                return;
1390
1391        user_sdma_free_request(req, false);
1392        set_comp_state(pq, cq, req->info.comp_idx, state, status);
1393        pq_update(pq);
1394}
1395
1396static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1397{
1398        if (atomic_dec_and_test(&pq->n_reqs))
1399                wake_up(&pq->wait);
1400}
1401
1402static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
1403{
1404        int i;
1405
1406        if (!list_empty(&req->txps)) {
1407                struct sdma_txreq *t, *p;
1408
1409                list_for_each_entry_safe(t, p, &req->txps, list) {
1410                        struct user_sdma_txreq *tx =
1411                                container_of(t, struct user_sdma_txreq, txreq);
1412                        list_del_init(&t->list);
1413                        sdma_txclean(req->pq->dd, t);
1414                        kmem_cache_free(req->pq->txreq_cache, tx);
1415                }
1416        }
1417
1418        for (i = 0; i < req->data_iovs; i++) {
1419                struct sdma_mmu_node *node = req->iovs[i].node;
1420
1421                if (!node)
1422                        continue;
1423
1424                req->iovs[i].node = NULL;
1425
1426                if (unpin)
1427                        hfi1_mmu_rb_remove(req->pq->handler,
1428                                           &node->rb);
1429                else
1430                        atomic_dec(&node->refcount);
1431        }
1432
1433        kfree(req->tids);
1434        clear_bit(req->info.comp_idx, req->pq->req_in_use);
1435}
1436
1437static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1438                                  struct hfi1_user_sdma_comp_q *cq,
1439                                  u16 idx, enum hfi1_sdma_comp_state state,
1440                                  int ret)
1441{
1442        if (state == ERROR)
1443                cq->comps[idx].errcode = -ret;
1444        smp_wmb(); /* make sure errcode is visible first */
1445        cq->comps[idx].status = state;
1446        trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1447                                        idx, state, ret);
1448}
1449
1450static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
1451                           unsigned long len)
1452{
1453        return (bool)(node->addr == addr);
1454}
1455
1456static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
1457{
1458        struct sdma_mmu_node *node =
1459                container_of(mnode, struct sdma_mmu_node, rb);
1460
1461        atomic_inc(&node->refcount);
1462        return 0;
1463}
1464
1465/*
1466 * Return 1 to remove the node from the rb tree and call the remove op.
1467 *
1468 * Called with the rb tree lock held.
1469 */
1470static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
1471                         void *evict_arg, bool *stop)
1472{
1473        struct sdma_mmu_node *node =
1474                container_of(mnode, struct sdma_mmu_node, rb);
1475        struct evict_data *evict_data = evict_arg;
1476
1477        /* is this node still being used? */
1478        if (atomic_read(&node->refcount))
1479                return 0; /* keep this node */
1480
1481        /* this node will be evicted, add its pages to our count */
1482        evict_data->cleared += node->npages;
1483
1484        /* have enough pages been cleared? */
1485        if (evict_data->cleared >= evict_data->target)
1486                *stop = true;
1487
1488        return 1; /* remove this node */
1489}
1490
1491static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
1492{
1493        struct sdma_mmu_node *node =
1494                container_of(mnode, struct sdma_mmu_node, rb);
1495
1496        unpin_sdma_pages(node);
1497        kfree(node);
1498}
1499
1500static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1501{
1502        struct sdma_mmu_node *node =
1503                container_of(mnode, struct sdma_mmu_node, rb);
1504
1505        if (!atomic_read(&node->refcount))
1506                return 1;
1507        return 0;
1508}
1509