linux/drivers/infiniband/hw/hfi1/user_exp_rcv.c
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 Cornelis Networks, Inc.
   4 * Copyright(c) 2015-2018 Intel Corporation.
   5 */
   6#include <asm/page.h>
   7#include <linux/string.h>
   8
   9#include "mmu_rb.h"
  10#include "user_exp_rcv.h"
  11#include "trace.h"
  12
  13static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
  14                            struct exp_tid_set *set,
  15                            struct hfi1_filedata *fd);
  16static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
  17static int set_rcvarray_entry(struct hfi1_filedata *fd,
  18                              struct tid_user_buf *tbuf,
  19                              u32 rcventry, struct tid_group *grp,
  20                              u16 pageidx, unsigned int npages);
  21static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
  22                                    struct tid_rb_node *tnode);
  23static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
  24                              const struct mmu_notifier_range *range,
  25                              unsigned long cur_seq);
  26static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
  27                            struct tid_group *grp,
  28                            unsigned int start, u16 count,
  29                            u32 *tidlist, unsigned int *tididx,
  30                            unsigned int *pmapped);
  31static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
  32                              struct tid_group **grp);
  33static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
  34
  35static const struct mmu_interval_notifier_ops tid_mn_ops = {
  36        .invalidate = tid_rb_invalidate,
  37};
  38
  39/*
  40 * Initialize context and file private data needed for Expected
  41 * receive caching. This needs to be done after the context has
  42 * been configured with the eager/expected RcvEntry counts.
  43 */
  44int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
  45                           struct hfi1_ctxtdata *uctxt)
  46{
  47        int ret = 0;
  48
  49        fd->entry_to_rb = kcalloc(uctxt->expected_count,
  50                                  sizeof(struct rb_node *),
  51                                  GFP_KERNEL);
  52        if (!fd->entry_to_rb)
  53                return -ENOMEM;
  54
  55        if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
  56                fd->invalid_tid_idx = 0;
  57                fd->invalid_tids = kcalloc(uctxt->expected_count,
  58                                           sizeof(*fd->invalid_tids),
  59                                           GFP_KERNEL);
  60                if (!fd->invalid_tids) {
  61                        kfree(fd->entry_to_rb);
  62                        fd->entry_to_rb = NULL;
  63                        return -ENOMEM;
  64                }
  65                fd->use_mn = true;
  66        }
  67
  68        /*
  69         * PSM does not have a good way to separate, count, and
  70         * effectively enforce a limit on RcvArray entries used by
  71         * subctxts (when context sharing is used) when TID caching
  72         * is enabled. To help with that, we calculate a per-process
  73         * RcvArray entry share and enforce that.
  74         * If TID caching is not in use, PSM deals with usage on its
  75         * own. In that case, we allow any subctxt to take all of the
  76         * entries.
  77         *
  78         * Make sure that we set the tid counts only after successful
  79         * init.
  80         */
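             /*
              * Worked example (hypothetical numbers): with expected_count =
              * 2048 and subctxt_cnt = 3, the base share is 2048 / 3 = 682
              * with a remainder of 2, so subctxts 0 and 1 each get 683
              * entries and subctxt 2 gets 682 (683 + 683 + 682 = 2048).
              */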
  81        spin_lock(&fd->tid_lock);
  82        if (uctxt->subctxt_cnt && fd->use_mn) {
  83                u16 remainder;
  84
  85                fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
  86                remainder = uctxt->expected_count % uctxt->subctxt_cnt;
  87                if (remainder && fd->subctxt < remainder)
  88                        fd->tid_limit++;
  89        } else {
  90                fd->tid_limit = uctxt->expected_count;
  91        }
  92        spin_unlock(&fd->tid_lock);
  93
  94        return ret;
  95}
  96
  97void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
  98{
  99        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 100
 101        mutex_lock(&uctxt->exp_mutex);
 102        if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
 103                unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
 104        if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
 105                unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
 106        mutex_unlock(&uctxt->exp_mutex);
 107
 108        kfree(fd->invalid_tids);
 109        fd->invalid_tids = NULL;
 110
 111        kfree(fd->entry_to_rb);
 112        fd->entry_to_rb = NULL;
 113}
 114
 115/*
 116 * Release pinned receive buffer pages.
 117 *
  118 * @mapped: true if the pages have been DMA mapped; false otherwise.
  119 * @idx: Index of the first page to unpin.
  120 * @npages: Number of pages to unpin.
 121 *
 122 * If the pages have been DMA mapped (indicated by mapped parameter), their
 123 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 124 * their info will be passed via a struct tid_user_buf.
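      *
      * For example, in this file hfi1_user_exp_rcv_setup() calls this with
      * mapped = false to release pages that were pinned but never programmed,
      * while clear_tid_node() passes the tid_rb_node and mapped = true.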
 125 */
 126static void unpin_rcv_pages(struct hfi1_filedata *fd,
 127                            struct tid_user_buf *tidbuf,
 128                            struct tid_rb_node *node,
 129                            unsigned int idx,
 130                            unsigned int npages,
 131                            bool mapped)
 132{
 133        struct page **pages;
 134        struct hfi1_devdata *dd = fd->uctxt->dd;
 135        struct mm_struct *mm;
 136
 137        if (mapped) {
 138                dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
 139                                 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
 140                pages = &node->pages[idx];
 141                mm = mm_from_tid_node(node);
 142        } else {
 143                pages = &tidbuf->pages[idx];
 144                mm = current->mm;
 145        }
 146        hfi1_release_user_pages(mm, pages, npages, mapped);
 147        fd->tid_n_pinned -= npages;
 148}
 149
 150/*
 151 * Pin receive buffer pages.
 152 */
 153static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
 154{
 155        int pinned;
 156        unsigned int npages;
 157        unsigned long vaddr = tidbuf->vaddr;
 158        struct page **pages = NULL;
 159        struct hfi1_devdata *dd = fd->uctxt->dd;
 160
 161        /* Get the number of pages the user buffer spans */
 162        npages = num_user_pages(vaddr, tidbuf->length);
 163        if (!npages)
 164                return -EINVAL;
 165
 166        if (npages > fd->uctxt->expected_count) {
 167                dd_dev_err(dd, "Expected buffer too big\n");
 168                return -EINVAL;
 169        }
 170
 171        /* Allocate the array of struct page pointers needed for pinning */
 172        pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 173        if (!pages)
 174                return -ENOMEM;
 175
 176        /*
 177         * Pin all the pages of the user buffer. If we can't pin all the
 178         * pages, accept the amount pinned so far and program only that.
 179         * User space knows how to deal with partially programmed buffers.
 180         */
 181        if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
 182                kfree(pages);
 183                return -ENOMEM;
 184        }
 185
 186        pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
 187        if (pinned <= 0) {
 188                kfree(pages);
 189                return pinned;
 190        }
 191        tidbuf->pages = pages;
 192        tidbuf->npages = npages;
 193        fd->tid_n_pinned += pinned;
 194        return pinned;
 195}
 196
 197/*
 198 * RcvArray entry allocation for Expected Receives is done by the
 199 * following algorithm:
 200 *
 201 * The context keeps 3 lists of groups of RcvArray entries:
 202 *   1. List of empty groups - tid_group_list
 203 *      This list is created during user context creation and
 204 *      contains elements which describe sets (of 8) of empty
 205 *      RcvArray entries.
 206 *   2. List of partially used groups - tid_used_list
 207 *      This list contains sets of RcvArray entries which are
 208 *      not completely used up. Another mapping request could
  209 *      use some or all of the remaining entries.
 210 *   3. List of full groups - tid_full_list
 211 *      This is the list where sets that are completely used
 212 *      up go.
 213 *
 214 * An attempt to optimize the usage of RcvArray entries is
 215 * made by finding all sets of physically contiguous pages in a
 216 * user's buffer.
 217 * These physically contiguous sets are further split into
 218 * sizes supported by the receive engine of the HFI. The
 219 * resulting sets of pages are stored in struct tid_pageset,
 220 * which describes the sets as:
 221 *    * .count - number of pages in this set
 222 *    * .idx - starting index into struct page ** array
 223 *                    of this set
 224 *
 225 * From this point on, the algorithm deals with the page sets
 226 * described above. The number of pagesets is divided by the
 227 * RcvArray group size to produce the number of full groups
 228 * needed.
 229 *
 230 * Groups from the 3 lists are manipulated using the following
 231 * rules:
 232 *   1. For each set of 8 pagesets, a complete group from
 233 *      tid_group_list is taken, programmed, and moved to
 234 *      the tid_full_list list.
 235 *   2. For all remaining pagesets:
 236 *      2.1 If the tid_used_list is empty and the tid_group_list
  237 *          is empty, stop processing pagesets and return only
 238 *          what has been programmed up to this point.
 239 *      2.2 If the tid_used_list is empty and the tid_group_list
 240 *          is not empty, move a group from tid_group_list to
 241 *          tid_used_list.
  242 *      2.3 For each group in tid_used_list, program as much as
 243 *          can fit into the group. If the group becomes fully
 244 *          used, move it to tid_full_list.
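      *
      * As an illustration (hypothetical numbers): a buffer that yields 19
      * pagesets with a group size of 8 is handled by pulling two complete
      * groups from tid_group_list, programming 8 pagesets into each, and
      * moving them to tid_full_list; the remaining 3 pagesets go into a
      * group on tid_used_list (moved there from tid_group_list if
      * tid_used_list was empty), which stays on tid_used_list because only
      * 3 of its 8 entries are used.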
 245 */
 246int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 247                            struct hfi1_tid_info *tinfo)
 248{
 249        int ret = 0, need_group = 0, pinned;
 250        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 251        struct hfi1_devdata *dd = uctxt->dd;
 252        unsigned int ngroups, pageidx = 0, pageset_count,
 253                tididx = 0, mapped, mapped_pages = 0;
 254        u32 *tidlist = NULL;
 255        struct tid_user_buf *tidbuf;
 256
 257        if (!PAGE_ALIGNED(tinfo->vaddr))
 258                return -EINVAL;
 259
 260        tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
 261        if (!tidbuf)
 262                return -ENOMEM;
 263
 264        tidbuf->vaddr = tinfo->vaddr;
 265        tidbuf->length = tinfo->length;
 266        tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
 267                                GFP_KERNEL);
 268        if (!tidbuf->psets) {
 269                kfree(tidbuf);
 270                return -ENOMEM;
 271        }
 272
 273        pinned = pin_rcv_pages(fd, tidbuf);
 274        if (pinned <= 0) {
 275                kfree(tidbuf->psets);
 276                kfree(tidbuf);
 277                return pinned;
 278        }
 279
 280        /* Find sets of physically contiguous pages */
 281        tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
 282
 283        /*
 284         * We don't need to access this under a lock since tid_used is per
 285         * process and the same process cannot be in hfi1_user_exp_rcv_clear()
 286         * and hfi1_user_exp_rcv_setup() at the same time.
 287         */
 288        spin_lock(&fd->tid_lock);
 289        if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
 290                pageset_count = fd->tid_limit - fd->tid_used;
 291        else
 292                pageset_count = tidbuf->n_psets;
 293        spin_unlock(&fd->tid_lock);
 294
 295        if (!pageset_count)
 296                goto bail;
 297
 298        ngroups = pageset_count / dd->rcv_entries.group_size;
 299        tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
 300        if (!tidlist) {
 301                ret = -ENOMEM;
 302                goto nomem;
 303        }
 304
 305        tididx = 0;
 306
 307        /*
 308         * From this point on, we are going to be using shared (between master
 309         * and subcontexts) context resources. We need to take the lock.
 310         */
 311        mutex_lock(&uctxt->exp_mutex);
 312        /*
 313         * The first step is to program the RcvArray entries which are complete
 314         * groups.
 315         */
 316        while (ngroups && uctxt->tid_group_list.count) {
 317                struct tid_group *grp =
 318                        tid_group_pop(&uctxt->tid_group_list);
 319
 320                ret = program_rcvarray(fd, tidbuf, grp,
 321                                       pageidx, dd->rcv_entries.group_size,
 322                                       tidlist, &tididx, &mapped);
 323                /*
 324                 * If there was a failure to program the RcvArray
 325                 * entries for the entire group, reset the grp fields
 326                 * and add the grp back to the free group list.
 327                 */
 328                if (ret <= 0) {
 329                        tid_group_add_tail(grp, &uctxt->tid_group_list);
 330                        hfi1_cdbg(TID,
 331                                  "Failed to program RcvArray group %d", ret);
 332                        goto unlock;
 333                }
 334
 335                tid_group_add_tail(grp, &uctxt->tid_full_list);
 336                ngroups--;
 337                pageidx += ret;
 338                mapped_pages += mapped;
 339        }
 340
 341        while (pageidx < pageset_count) {
 342                struct tid_group *grp, *ptr;
 343                /*
 344                 * If we don't have any partially used tid groups, check
 345                 * if we have empty groups. If so, take one from there and
  346                 * put it in the partially used list.
 347                 */
 348                if (!uctxt->tid_used_list.count || need_group) {
 349                        if (!uctxt->tid_group_list.count)
 350                                goto unlock;
 351
 352                        grp = tid_group_pop(&uctxt->tid_group_list);
 353                        tid_group_add_tail(grp, &uctxt->tid_used_list);
 354                        need_group = 0;
 355                }
 356                /*
 357                 * There is an optimization opportunity here - instead of
 358                 * fitting as many page sets as we can, check for a group
 359                 * later on in the list that could fit all of them.
 360                 */
 361                list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
 362                                         list) {
 363                        unsigned use = min_t(unsigned, pageset_count - pageidx,
 364                                             grp->size - grp->used);
 365
 366                        ret = program_rcvarray(fd, tidbuf, grp,
 367                                               pageidx, use, tidlist,
 368                                               &tididx, &mapped);
 369                        if (ret < 0) {
 370                                hfi1_cdbg(TID,
 371                                          "Failed to program RcvArray entries %d",
 372                                          ret);
 373                                goto unlock;
 374                        } else if (ret > 0) {
 375                                if (grp->used == grp->size)
 376                                        tid_group_move(grp,
 377                                                       &uctxt->tid_used_list,
 378                                                       &uctxt->tid_full_list);
 379                                pageidx += ret;
 380                                mapped_pages += mapped;
 381                                need_group = 0;
 382                                /* Check if we are done so we break out early */
 383                                if (pageidx >= pageset_count)
 384                                        break;
 385                        } else if (WARN_ON(ret == 0)) {
 386                                /*
 387                                 * If ret is 0, we did not program any entries
 388                                 * into this group, which can only happen if
 389                                 * we've screwed up the accounting somewhere.
 390                                 * Warn and try to continue.
 391                                 */
 392                                need_group = 1;
 393                        }
 394                }
 395        }
 396unlock:
 397        mutex_unlock(&uctxt->exp_mutex);
 398nomem:
 399        hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
 400                  mapped_pages, ret);
 401        if (tididx) {
 402                spin_lock(&fd->tid_lock);
 403                fd->tid_used += tididx;
 404                spin_unlock(&fd->tid_lock);
 405                tinfo->tidcnt = tididx;
 406                tinfo->length = mapped_pages * PAGE_SIZE;
 407
 408                if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
 409                                 tidlist, sizeof(tidlist[0]) * tididx)) {
 410                        /*
 411                         * On failure to copy to the user level, we need to undo
 412                         * everything done so far so we don't leak resources.
 413                         */
 414                        tinfo->tidlist = (unsigned long)&tidlist;
 415                        hfi1_user_exp_rcv_clear(fd, tinfo);
 416                        tinfo->tidlist = 0;
 417                        ret = -EFAULT;
 418                        goto bail;
 419                }
 420        }
 421
 422        /*
 423         * If not everything was mapped (due to insufficient RcvArray entries,
  424         * for example), unpin all unmapped pages so we can pin them next time.
 425         */
 426        if (mapped_pages != pinned)
 427                unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
 428                                (pinned - mapped_pages), false);
 429bail:
 430        kfree(tidbuf->psets);
 431        kfree(tidlist);
 432        kfree(tidbuf->pages);
 433        kfree(tidbuf);
 434        return ret > 0 ? 0 : ret;
 435}
 436
 437int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 438                            struct hfi1_tid_info *tinfo)
 439{
 440        int ret = 0;
 441        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 442        u32 *tidinfo;
 443        unsigned tididx;
 444
 445        if (unlikely(tinfo->tidcnt > fd->tid_used))
 446                return -EINVAL;
 447
 448        tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
 449                              sizeof(tidinfo[0]) * tinfo->tidcnt);
 450        if (IS_ERR(tidinfo))
 451                return PTR_ERR(tidinfo);
 452
 453        mutex_lock(&uctxt->exp_mutex);
 454        for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
 455                ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
 456                if (ret) {
 457                        hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
 458                                  ret);
 459                        break;
 460                }
 461        }
 462        spin_lock(&fd->tid_lock);
 463        fd->tid_used -= tididx;
 464        spin_unlock(&fd->tid_lock);
 465        tinfo->tidcnt = tididx;
 466        mutex_unlock(&uctxt->exp_mutex);
 467
 468        kfree(tidinfo);
 469        return ret;
 470}
 471
 472int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
 473                              struct hfi1_tid_info *tinfo)
 474{
 475        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 476        unsigned long *ev = uctxt->dd->events +
 477                (uctxt_offset(uctxt) + fd->subctxt);
 478        u32 *array;
 479        int ret = 0;
 480
 481        /*
 482         * copy_to_user() can sleep, which will leave the invalid_lock
 483         * locked and cause the MMU notifier to be blocked on the lock
 484         * for a long time.
 485         * Copy the data to a local buffer so we can release the lock.
 486         */
 487        array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
 488        if (!array)
 489                return -EFAULT;
 490
 491        spin_lock(&fd->invalid_lock);
 492        if (fd->invalid_tid_idx) {
 493                memcpy(array, fd->invalid_tids, sizeof(*array) *
 494                       fd->invalid_tid_idx);
 495                memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
 496                       fd->invalid_tid_idx);
 497                tinfo->tidcnt = fd->invalid_tid_idx;
 498                fd->invalid_tid_idx = 0;
 499                /*
 500                 * Reset the user flag while still holding the lock.
 501                 * Otherwise, PSM can miss events.
 502                 */
 503                clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 504        } else {
 505                tinfo->tidcnt = 0;
 506        }
 507        spin_unlock(&fd->invalid_lock);
 508
 509        if (tinfo->tidcnt) {
 510                if (copy_to_user((void __user *)tinfo->tidlist,
 511                                 array, sizeof(*array) * tinfo->tidcnt))
 512                        ret = -EFAULT;
 513        }
 514        kfree(array);
 515
 516        return ret;
 517}
 518
 519static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
 520{
 521        unsigned pagecount, pageidx, setcount = 0, i;
 522        unsigned long pfn, this_pfn;
 523        struct page **pages = tidbuf->pages;
 524        struct tid_pageset *list = tidbuf->psets;
 525
 526        if (!npages)
 527                return 0;
 528
 529        /*
 530         * Look for sets of physically contiguous pages in the user buffer.
 531         * This will allow us to optimize Expected RcvArray entry usage by
 532         * using the bigger supported sizes.
 533         */
 534        pfn = page_to_pfn(pages[0]);
 535        for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
 536                this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
 537
 538                /*
 539                 * If the pfn's are not sequential, pages are not physically
 540                 * contiguous.
 541                 */
 542                if (this_pfn != ++pfn) {
 543                        /*
 544                         * At this point we have to loop over the set of
  545                         * physically contiguous pages and break them down into
  546                         * sizes supported by the HW.
  547                         * There are two main constraints:
  548                         *     1. The max buffer size is MAX_EXPECTED_BUFFER.
  549                         *        If the total set size is bigger than that,
  550                         *        program only a MAX_EXPECTED_BUFFER chunk.
  551                         *     2. The buffer size has to be a power of two. If
  552                         *        it is not, round down to the closest power of
  553                         *        2 and program that size.
 554                         */
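                             /*
                              * Example trace (hypothetical, assuming 4 KiB pages and a
                              * 2 MiB MAX_EXPECTED_BUFFER, i.e. 512 pages): a run of 700
                              * contiguous pages is split into pagesets of
                              * 512 + 128 + 32 + 16 + 8 + 4 pages, each a power-of-two
                              * size no larger than MAX_EXPECTED_BUFFER.
                              */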
 555                        while (pagecount) {
 556                                int maxpages = pagecount;
 557                                u32 bufsize = pagecount * PAGE_SIZE;
 558
 559                                if (bufsize > MAX_EXPECTED_BUFFER)
 560                                        maxpages =
 561                                                MAX_EXPECTED_BUFFER >>
 562                                                PAGE_SHIFT;
 563                                else if (!is_power_of_2(bufsize))
 564                                        maxpages =
 565                                                rounddown_pow_of_two(bufsize) >>
 566                                                PAGE_SHIFT;
 567
 568                                list[setcount].idx = pageidx;
 569                                list[setcount].count = maxpages;
 570                                pagecount -= maxpages;
 571                                pageidx += maxpages;
 572                                setcount++;
 573                        }
 574                        pageidx = i;
 575                        pagecount = 1;
 576                        pfn = this_pfn;
 577                } else {
 578                        pagecount++;
 579                }
 580        }
 581        return setcount;
 582}
 583
 584/**
 585 * program_rcvarray() - program an RcvArray group with receive buffers
 586 * @fd: filedata pointer
 587 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 588 *        virtual address, buffer length, page pointers, pagesets (array of
 589 *        struct tid_pageset holding information on physically contiguous
 590 *        chunks from the user buffer), and other fields.
 591 * @grp: RcvArray group
 592 * @start: starting index into sets array
 593 * @count: number of struct tid_pageset's to program
  594 * @tidlist: the array of u32 elements where the information about the
 595 *           programmed RcvArray entries is to be encoded.
 596 * @tididx: starting offset into tidlist
 597 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 598 *           entries.
 599 *
 600 * This function will program up to 'count' number of RcvArray entries from the
 601 * group 'grp'. To make best use of write-combining writes, the function will
 602 * perform writes to the unused RcvArray entries which will be ignored by the
 603 * HW. Each RcvArray entry will be programmed with a physically contiguous
 604 * buffer chunk from the user's virtual buffer.
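      *
      * Illustrative example (hypothetical state): for a group of size 8 with
      * map 0b00000011 (entries 0 and 1 already in use) and count = 2, the
      * function issues write-combining filler writes to entries 0 and 1,
      * programs pagesets into entries 2 and 3, fills entries 4-7 with filler
      * writes, and returns 2.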
 605 *
 606 * Return:
 607 * -EINVAL if the requested count is larger than the size of the group,
 608 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 609 * number of RcvArray entries programmed.
 610 */
 611static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
 612                            struct tid_group *grp,
 613                            unsigned int start, u16 count,
 614                            u32 *tidlist, unsigned int *tididx,
 615                            unsigned int *pmapped)
 616{
 617        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 618        struct hfi1_devdata *dd = uctxt->dd;
 619        u16 idx;
 620        u32 tidinfo = 0, rcventry, useidx = 0;
 621        int mapped = 0;
 622
 623        /* Count should never be larger than the group size */
 624        if (count > grp->size)
 625                return -EINVAL;
 626
 627        /* Find the first unused entry in the group */
 628        for (idx = 0; idx < grp->size; idx++) {
 629                if (!(grp->map & (1 << idx))) {
 630                        useidx = idx;
 631                        break;
 632                }
 633                rcv_array_wc_fill(dd, grp->base + idx);
 634        }
 635
 636        idx = 0;
 637        while (idx < count) {
 638                u16 npages, pageidx, setidx = start + idx;
 639                int ret = 0;
 640
 641                /*
 642                 * If this entry in the group is used, move to the next one.
 643                 * If we go past the end of the group, exit the loop.
 644                 */
 645                if (useidx >= grp->size) {
 646                        break;
 647                } else if (grp->map & (1 << useidx)) {
 648                        rcv_array_wc_fill(dd, grp->base + useidx);
 649                        useidx++;
 650                        continue;
 651                }
 652
 653                rcventry = grp->base + useidx;
 654                npages = tbuf->psets[setidx].count;
 655                pageidx = tbuf->psets[setidx].idx;
 656
 657                ret = set_rcvarray_entry(fd, tbuf,
 658                                         rcventry, grp, pageidx,
 659                                         npages);
 660                if (ret)
 661                        return ret;
 662                mapped += npages;
 663
 664                tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
 665                        EXP_TID_SET(LEN, npages);
 666                tidlist[(*tididx)++] = tidinfo;
 667                grp->used++;
 668                grp->map |= 1 << useidx++;
 669                idx++;
 670        }
 671
 672        /* Fill the rest of the group with "blank" writes */
 673        for (; useidx < grp->size; useidx++)
 674                rcv_array_wc_fill(dd, grp->base + useidx);
 675        *pmapped = mapped;
 676        return idx;
 677}
 678
 679static int set_rcvarray_entry(struct hfi1_filedata *fd,
 680                              struct tid_user_buf *tbuf,
 681                              u32 rcventry, struct tid_group *grp,
 682                              u16 pageidx, unsigned int npages)
 683{
 684        int ret;
 685        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 686        struct tid_rb_node *node;
 687        struct hfi1_devdata *dd = uctxt->dd;
 688        dma_addr_t phys;
 689        struct page **pages = tbuf->pages + pageidx;
 690
 691        /*
 692         * Allocate the node first so we can handle a potential
 693         * failure before we've programmed anything.
 694         */
 695        node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
 696                       GFP_KERNEL);
 697        if (!node)
 698                return -ENOMEM;
 699
 700        phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
 701                              npages * PAGE_SIZE, DMA_FROM_DEVICE);
 702        if (dma_mapping_error(&dd->pcidev->dev, phys)) {
 703                dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
 704                           phys);
 705                kfree(node);
 706                return -EFAULT;
 707        }
 708
 709        node->fdata = fd;
 710        node->phys = page_to_phys(pages[0]);
 711        node->npages = npages;
 712        node->rcventry = rcventry;
 713        node->dma_addr = phys;
 714        node->grp = grp;
 715        node->freed = false;
 716        memcpy(node->pages, pages, sizeof(struct page *) * npages);
 717
 718        if (fd->use_mn) {
 719                ret = mmu_interval_notifier_insert(
 720                        &node->notifier, current->mm,
 721                        tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
 722                        &tid_mn_ops);
 723                if (ret)
 724                        goto out_unmap;
 725                /*
 726                 * FIXME: This is in the wrong order, the notifier should be
 727                 * established before the pages are pinned by pin_rcv_pages.
 728                 */
 729                mmu_interval_read_begin(&node->notifier);
 730        }
 731        fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
 732
 733        hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 734        trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
 735                               node->notifier.interval_tree.start, node->phys,
 736                               phys);
 737        return 0;
 738
 739out_unmap:
 740        hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
 741                  node->rcventry, node->notifier.interval_tree.start,
 742                  node->phys, ret);
 743        dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
 744                         DMA_FROM_DEVICE);
 745        kfree(node);
 746        return -EFAULT;
 747}
 748
 749static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
 750                              struct tid_group **grp)
 751{
 752        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 753        struct hfi1_devdata *dd = uctxt->dd;
 754        struct tid_rb_node *node;
 755        u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
 756        u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
 757
 758        if (tididx >= uctxt->expected_count) {
 759                dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
 760                           tididx, uctxt->ctxt);
 761                return -EINVAL;
 762        }
 763
 764        if (tidctrl == 0x3)
 765                return -EINVAL;
 766
 767        rcventry = tididx + (tidctrl - 1);
 768
 769        node = fd->entry_to_rb[rcventry];
 770        if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 771                return -EBADF;
 772
 773        if (grp)
 774                *grp = node->grp;
 775
 776        if (fd->use_mn)
 777                mmu_interval_notifier_remove(&node->notifier);
 778        cacheless_tid_rb_remove(fd, node);
 779
 780        return 0;
 781}
 782
 783static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 784{
 785        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 786        struct hfi1_devdata *dd = uctxt->dd;
 787
 788        trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
 789                                 node->npages,
 790                                 node->notifier.interval_tree.start, node->phys,
 791                                 node->dma_addr);
 792
 793        /*
 794         * Make sure device has seen the write before we unpin the
 795         * pages.
 796         */
 797        hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
 798
 799        unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
 800
 801        node->grp->used--;
 802        node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
 803
 804        if (node->grp->used == node->grp->size - 1)
 805                tid_group_move(node->grp, &uctxt->tid_full_list,
 806                               &uctxt->tid_used_list);
 807        else if (!node->grp->used)
 808                tid_group_move(node->grp, &uctxt->tid_used_list,
 809                               &uctxt->tid_group_list);
 810        kfree(node);
 811}
 812
 813/*
 814 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 815 * clearing nodes in the non-cached case.
 816 */
 817static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 818                            struct exp_tid_set *set,
 819                            struct hfi1_filedata *fd)
 820{
 821        struct tid_group *grp, *ptr;
 822        int i;
 823
 824        list_for_each_entry_safe(grp, ptr, &set->list, list) {
 825                list_del_init(&grp->list);
 826
 827                for (i = 0; i < grp->size; i++) {
 828                        if (grp->map & (1 << i)) {
 829                                u16 rcventry = grp->base + i;
 830                                struct tid_rb_node *node;
 831
 832                                node = fd->entry_to_rb[rcventry -
 833                                                          uctxt->expected_base];
 834                                if (!node || node->rcventry != rcventry)
 835                                        continue;
 836
 837                                if (fd->use_mn)
 838                                        mmu_interval_notifier_remove(
 839                                                &node->notifier);
 840                                cacheless_tid_rb_remove(fd, node);
 841                        }
 842                }
 843        }
 844}
 845
 846static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
 847                              const struct mmu_notifier_range *range,
 848                              unsigned long cur_seq)
 849{
 850        struct tid_rb_node *node =
 851                container_of(mni, struct tid_rb_node, notifier);
 852        struct hfi1_filedata *fdata = node->fdata;
 853        struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 854
 855        if (node->freed)
 856                return true;
 857
 858        trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
 859                                 node->notifier.interval_tree.start,
 860                                 node->rcventry, node->npages, node->dma_addr);
 861        node->freed = true;
 862
 863        spin_lock(&fdata->invalid_lock);
 864        if (fdata->invalid_tid_idx < uctxt->expected_count) {
 865                fdata->invalid_tids[fdata->invalid_tid_idx] =
 866                        rcventry2tidinfo(node->rcventry - uctxt->expected_base);
 867                fdata->invalid_tids[fdata->invalid_tid_idx] |=
 868                        EXP_TID_SET(LEN, node->npages);
 869                if (!fdata->invalid_tid_idx) {
 870                        unsigned long *ev;
 871
 872                        /*
 873                         * hfi1_set_uevent_bits() sets a user event flag
 874                         * for all processes. Because calling into the
 875                         * driver to process TID cache invalidations is
 876                         * expensive and TID cache invalidations are
 877                         * handled on a per-process basis, we can
 878                         * optimize this to set the flag only for the
 879                         * process in question.
 880                         */
 881                        ev = uctxt->dd->events +
 882                                (uctxt_offset(uctxt) + fdata->subctxt);
 883                        set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 884                }
 885                fdata->invalid_tid_idx++;
 886        }
 887        spin_unlock(&fdata->invalid_lock);
 888        return true;
 889}
 890
 891static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 892                                    struct tid_rb_node *tnode)
 893{
 894        u32 base = fdata->uctxt->expected_base;
 895
 896        fdata->entry_to_rb[tnode->rcventry - base] = NULL;
 897        clear_tid_node(fdata, tnode);
 898}
 899