linux/drivers/infiniband/hw/hfi1/user_exp_rcv.c
<<
>>
Prefs
   1/*
   2 * Copyright(c) 2015-2018 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47#include <asm/page.h>
  48#include <linux/string.h>
  49
  50#include "mmu_rb.h"
  51#include "user_exp_rcv.h"
  52#include "trace.h"
  53
  54static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
  55                            struct exp_tid_set *set,
  56                            struct hfi1_filedata *fd);
  57static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
  58static int set_rcvarray_entry(struct hfi1_filedata *fd,
  59                              struct tid_user_buf *tbuf,
  60                              u32 rcventry, struct tid_group *grp,
  61                              u16 pageidx, unsigned int npages);
  62static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
  63static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
  64                                    struct tid_rb_node *tnode);
  65static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
  66static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
  67static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
  68                            struct tid_group *grp,
  69                            unsigned int start, u16 count,
  70                            u32 *tidlist, unsigned int *tididx,
  71                            unsigned int *pmapped);
  72static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
  73                              struct tid_group **grp);
  74static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
  75
  76static struct mmu_rb_ops tid_rb_ops = {
  77        .insert = tid_rb_insert,
  78        .remove = tid_rb_remove,
  79        .invalidate = tid_rb_invalidate
  80};
  81
  82/*
  83 * Initialize context and file private data needed for Expected
  84 * receive caching. This needs to be done after the context has
  85 * been configured with the eager/expected RcvEntry counts.
  86 */
  87int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
  88                           struct hfi1_ctxtdata *uctxt)
  89{
  90        struct hfi1_devdata *dd = uctxt->dd;
  91        int ret = 0;
  92
  93        spin_lock_init(&fd->tid_lock);
  94        spin_lock_init(&fd->invalid_lock);
  95
  96        fd->entry_to_rb = kcalloc(uctxt->expected_count,
  97                                  sizeof(struct rb_node *),
  98                                  GFP_KERNEL);
  99        if (!fd->entry_to_rb)
 100                return -ENOMEM;
 101
 102        if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
 103                fd->invalid_tid_idx = 0;
 104                fd->invalid_tids = kcalloc(uctxt->expected_count,
 105                                           sizeof(*fd->invalid_tids),
 106                                           GFP_KERNEL);
 107                if (!fd->invalid_tids) {
 108                        kfree(fd->entry_to_rb);
 109                        fd->entry_to_rb = NULL;
 110                        return -ENOMEM;
 111                }
 112
 113                /*
 114                 * Register MMU notifier callbacks. If the registration
 115                 * fails, continue without TID caching for this context.
 116                 */
 117                ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
 118                                           dd->pport->hfi1_wq,
 119                                           &fd->handler);
 120                if (ret) {
 121                        dd_dev_info(dd,
 122                                    "Failed MMU notifier registration %d\n",
 123                                    ret);
 124                        ret = 0;
 125                }
 126        }
 127
 128        /*
 129         * PSM does not have a good way to separate, count, and
 130         * effectively enforce a limit on RcvArray entries used by
 131         * subctxts (when context sharing is used) when TID caching
 132         * is enabled. To help with that, we calculate a per-process
 133         * RcvArray entry share and enforce that.
 134         * If TID caching is not in use, PSM deals with usage on its
 135         * own. In that case, we allow any subctxt to take all of the
 136         * entries.
 137         *
 138         * Make sure that we set the tid counts only after successful
 139         * init.
 140         */
 141        spin_lock(&fd->tid_lock);
 142        if (uctxt->subctxt_cnt && fd->handler) {
 143                u16 remainder;
 144
 145                fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
 146                remainder = uctxt->expected_count % uctxt->subctxt_cnt;
 147                if (remainder && fd->subctxt < remainder)
 148                        fd->tid_limit++;
 149        } else {
 150                fd->tid_limit = uctxt->expected_count;
 151        }
 152        spin_unlock(&fd->tid_lock);
 153
 154        return ret;
 155}
 156
 157void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 158{
 159        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 160
 161        /*
 162         * The notifier would have been removed when the process'es mm
 163         * was freed.
 164         */
 165        if (fd->handler) {
 166                hfi1_mmu_rb_unregister(fd->handler);
 167        } else {
 168                if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
 169                        unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
 170                if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
 171                        unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
 172        }
 173
 174        kfree(fd->invalid_tids);
 175        fd->invalid_tids = NULL;
 176
 177        kfree(fd->entry_to_rb);
 178        fd->entry_to_rb = NULL;
 179}
 180
 181/**
 182 * Release pinned receive buffer pages.
 183 *
 184 * @mapped - true if the pages have been DMA mapped. false otherwise.
 185 * @idx - Index of the first page to unpin.
 186 * @npages - No of pages to unpin.
 187 *
 188 * If the pages have been DMA mapped (indicated by mapped parameter), their
 189 * info will be passed via a struct tid_rb_node. If they haven't been mapped,
 190 * their info will be passed via a struct tid_user_buf.
 191 */
 192static void unpin_rcv_pages(struct hfi1_filedata *fd,
 193                            struct tid_user_buf *tidbuf,
 194                            struct tid_rb_node *node,
 195                            unsigned int idx,
 196                            unsigned int npages,
 197                            bool mapped)
 198{
 199        struct page **pages;
 200        struct hfi1_devdata *dd = fd->uctxt->dd;
 201
 202        if (mapped) {
 203                pci_unmap_single(dd->pcidev, node->dma_addr,
 204                                 node->mmu.len, PCI_DMA_FROMDEVICE);
 205                pages = &node->pages[idx];
 206        } else {
 207                pages = &tidbuf->pages[idx];
 208        }
 209        hfi1_release_user_pages(fd->mm, pages, npages, mapped);
 210        fd->tid_n_pinned -= npages;
 211}
 212
 213/**
 214 * Pin receive buffer pages.
 215 */
 216static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
 217{
 218        int pinned;
 219        unsigned int npages;
 220        unsigned long vaddr = tidbuf->vaddr;
 221        struct page **pages = NULL;
 222        struct hfi1_devdata *dd = fd->uctxt->dd;
 223
 224        /* Get the number of pages the user buffer spans */
 225        npages = num_user_pages(vaddr, tidbuf->length);
 226        if (!npages)
 227                return -EINVAL;
 228
 229        if (npages > fd->uctxt->expected_count) {
 230                dd_dev_err(dd, "Expected buffer too big\n");
 231                return -EINVAL;
 232        }
 233
 234        /* Verify that access is OK for the user buffer */
 235        if (!access_ok((void __user *)vaddr,
 236                       npages * PAGE_SIZE)) {
 237                dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
 238                           (void *)vaddr, npages);
 239                return -EFAULT;
 240        }
 241        /* Allocate the array of struct page pointers needed for pinning */
 242        pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 243        if (!pages)
 244                return -ENOMEM;
 245
 246        /*
 247         * Pin all the pages of the user buffer. If we can't pin all the
 248         * pages, accept the amount pinned so far and program only that.
 249         * User space knows how to deal with partially programmed buffers.
 250         */
 251        if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
 252                kfree(pages);
 253                return -ENOMEM;
 254        }
 255
 256        pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
 257        if (pinned <= 0) {
 258                kfree(pages);
 259                return pinned;
 260        }
 261        tidbuf->pages = pages;
 262        tidbuf->npages = npages;
 263        fd->tid_n_pinned += pinned;
 264        return pinned;
 265}
 266
 267/*
 268 * RcvArray entry allocation for Expected Receives is done by the
 269 * following algorithm:
 270 *
 271 * The context keeps 3 lists of groups of RcvArray entries:
 272 *   1. List of empty groups - tid_group_list
 273 *      This list is created during user context creation and
 274 *      contains elements which describe sets (of 8) of empty
 275 *      RcvArray entries.
 276 *   2. List of partially used groups - tid_used_list
 277 *      This list contains sets of RcvArray entries which are
 278 *      not completely used up. Another mapping request could
 279 *      use some of all of the remaining entries.
 280 *   3. List of full groups - tid_full_list
 281 *      This is the list where sets that are completely used
 282 *      up go.
 283 *
 284 * An attempt to optimize the usage of RcvArray entries is
 285 * made by finding all sets of physically contiguous pages in a
 286 * user's buffer.
 287 * These physically contiguous sets are further split into
 288 * sizes supported by the receive engine of the HFI. The
 289 * resulting sets of pages are stored in struct tid_pageset,
 290 * which describes the sets as:
 291 *    * .count - number of pages in this set
 292 *    * .idx - starting index into struct page ** array
 293 *                    of this set
 294 *
 295 * From this point on, the algorithm deals with the page sets
 296 * described above. The number of pagesets is divided by the
 297 * RcvArray group size to produce the number of full groups
 298 * needed.
 299 *
 300 * Groups from the 3 lists are manipulated using the following
 301 * rules:
 302 *   1. For each set of 8 pagesets, a complete group from
 303 *      tid_group_list is taken, programmed, and moved to
 304 *      the tid_full_list list.
 305 *   2. For all remaining pagesets:
 306 *      2.1 If the tid_used_list is empty and the tid_group_list
 307 *          is empty, stop processing pageset and return only
 308 *          what has been programmed up to this point.
 309 *      2.2 If the tid_used_list is empty and the tid_group_list
 310 *          is not empty, move a group from tid_group_list to
 311 *          tid_used_list.
 312 *      2.3 For each group is tid_used_group, program as much as
 313 *          can fit into the group. If the group becomes fully
 314 *          used, move it to tid_full_list.
 315 */
 316int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 317                            struct hfi1_tid_info *tinfo)
 318{
 319        int ret = 0, need_group = 0, pinned;
 320        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 321        struct hfi1_devdata *dd = uctxt->dd;
 322        unsigned int ngroups, pageidx = 0, pageset_count,
 323                tididx = 0, mapped, mapped_pages = 0;
 324        u32 *tidlist = NULL;
 325        struct tid_user_buf *tidbuf;
 326
 327        if (!PAGE_ALIGNED(tinfo->vaddr))
 328                return -EINVAL;
 329
 330        tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
 331        if (!tidbuf)
 332                return -ENOMEM;
 333
 334        tidbuf->vaddr = tinfo->vaddr;
 335        tidbuf->length = tinfo->length;
 336        tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
 337                                GFP_KERNEL);
 338        if (!tidbuf->psets) {
 339                kfree(tidbuf);
 340                return -ENOMEM;
 341        }
 342
 343        pinned = pin_rcv_pages(fd, tidbuf);
 344        if (pinned <= 0) {
 345                kfree(tidbuf->psets);
 346                kfree(tidbuf);
 347                return pinned;
 348        }
 349
 350        /* Find sets of physically contiguous pages */
 351        tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
 352
 353        /*
 354         * We don't need to access this under a lock since tid_used is per
 355         * process and the same process cannot be in hfi1_user_exp_rcv_clear()
 356         * and hfi1_user_exp_rcv_setup() at the same time.
 357         */
 358        spin_lock(&fd->tid_lock);
 359        if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
 360                pageset_count = fd->tid_limit - fd->tid_used;
 361        else
 362                pageset_count = tidbuf->n_psets;
 363        spin_unlock(&fd->tid_lock);
 364
 365        if (!pageset_count)
 366                goto bail;
 367
 368        ngroups = pageset_count / dd->rcv_entries.group_size;
 369        tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
 370        if (!tidlist) {
 371                ret = -ENOMEM;
 372                goto nomem;
 373        }
 374
 375        tididx = 0;
 376
 377        /*
 378         * From this point on, we are going to be using shared (between master
 379         * and subcontexts) context resources. We need to take the lock.
 380         */
 381        mutex_lock(&uctxt->exp_mutex);
 382        /*
 383         * The first step is to program the RcvArray entries which are complete
 384         * groups.
 385         */
 386        while (ngroups && uctxt->tid_group_list.count) {
 387                struct tid_group *grp =
 388                        tid_group_pop(&uctxt->tid_group_list);
 389
 390                ret = program_rcvarray(fd, tidbuf, grp,
 391                                       pageidx, dd->rcv_entries.group_size,
 392                                       tidlist, &tididx, &mapped);
 393                /*
 394                 * If there was a failure to program the RcvArray
 395                 * entries for the entire group, reset the grp fields
 396                 * and add the grp back to the free group list.
 397                 */
 398                if (ret <= 0) {
 399                        tid_group_add_tail(grp, &uctxt->tid_group_list);
 400                        hfi1_cdbg(TID,
 401                                  "Failed to program RcvArray group %d", ret);
 402                        goto unlock;
 403                }
 404
 405                tid_group_add_tail(grp, &uctxt->tid_full_list);
 406                ngroups--;
 407                pageidx += ret;
 408                mapped_pages += mapped;
 409        }
 410
 411        while (pageidx < pageset_count) {
 412                struct tid_group *grp, *ptr;
 413                /*
 414                 * If we don't have any partially used tid groups, check
 415                 * if we have empty groups. If so, take one from there and
 416                 * put in the partially used list.
 417                 */
 418                if (!uctxt->tid_used_list.count || need_group) {
 419                        if (!uctxt->tid_group_list.count)
 420                                goto unlock;
 421
 422                        grp = tid_group_pop(&uctxt->tid_group_list);
 423                        tid_group_add_tail(grp, &uctxt->tid_used_list);
 424                        need_group = 0;
 425                }
 426                /*
 427                 * There is an optimization opportunity here - instead of
 428                 * fitting as many page sets as we can, check for a group
 429                 * later on in the list that could fit all of them.
 430                 */
 431                list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
 432                                         list) {
 433                        unsigned use = min_t(unsigned, pageset_count - pageidx,
 434                                             grp->size - grp->used);
 435
 436                        ret = program_rcvarray(fd, tidbuf, grp,
 437                                               pageidx, use, tidlist,
 438                                               &tididx, &mapped);
 439                        if (ret < 0) {
 440                                hfi1_cdbg(TID,
 441                                          "Failed to program RcvArray entries %d",
 442                                          ret);
 443                                goto unlock;
 444                        } else if (ret > 0) {
 445                                if (grp->used == grp->size)
 446                                        tid_group_move(grp,
 447                                                       &uctxt->tid_used_list,
 448                                                       &uctxt->tid_full_list);
 449                                pageidx += ret;
 450                                mapped_pages += mapped;
 451                                need_group = 0;
 452                                /* Check if we are done so we break out early */
 453                                if (pageidx >= pageset_count)
 454                                        break;
 455                        } else if (WARN_ON(ret == 0)) {
 456                                /*
 457                                 * If ret is 0, we did not program any entries
 458                                 * into this group, which can only happen if
 459                                 * we've screwed up the accounting somewhere.
 460                                 * Warn and try to continue.
 461                                 */
 462                                need_group = 1;
 463                        }
 464                }
 465        }
 466unlock:
 467        mutex_unlock(&uctxt->exp_mutex);
 468nomem:
 469        hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
 470                  mapped_pages, ret);
 471        if (tididx) {
 472                spin_lock(&fd->tid_lock);
 473                fd->tid_used += tididx;
 474                spin_unlock(&fd->tid_lock);
 475                tinfo->tidcnt = tididx;
 476                tinfo->length = mapped_pages * PAGE_SIZE;
 477
 478                if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
 479                                 tidlist, sizeof(tidlist[0]) * tididx)) {
 480                        /*
 481                         * On failure to copy to the user level, we need to undo
 482                         * everything done so far so we don't leak resources.
 483                         */
 484                        tinfo->tidlist = (unsigned long)&tidlist;
 485                        hfi1_user_exp_rcv_clear(fd, tinfo);
 486                        tinfo->tidlist = 0;
 487                        ret = -EFAULT;
 488                        goto bail;
 489                }
 490        }
 491
 492        /*
 493         * If not everything was mapped (due to insufficient RcvArray entries,
 494         * for example), unpin all unmapped pages so we can pin them nex time.
 495         */
 496        if (mapped_pages != pinned)
 497                unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
 498                                (pinned - mapped_pages), false);
 499bail:
 500        kfree(tidbuf->psets);
 501        kfree(tidlist);
 502        kfree(tidbuf->pages);
 503        kfree(tidbuf);
 504        return ret > 0 ? 0 : ret;
 505}
 506
 507int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
 508                            struct hfi1_tid_info *tinfo)
 509{
 510        int ret = 0;
 511        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 512        u32 *tidinfo;
 513        unsigned tididx;
 514
 515        if (unlikely(tinfo->tidcnt > fd->tid_used))
 516                return -EINVAL;
 517
 518        tidinfo = memdup_user(u64_to_user_ptr(tinfo->tidlist),
 519                              sizeof(tidinfo[0]) * tinfo->tidcnt);
 520        if (IS_ERR(tidinfo))
 521                return PTR_ERR(tidinfo);
 522
 523        mutex_lock(&uctxt->exp_mutex);
 524        for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
 525                ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
 526                if (ret) {
 527                        hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
 528                                  ret);
 529                        break;
 530                }
 531        }
 532        spin_lock(&fd->tid_lock);
 533        fd->tid_used -= tididx;
 534        spin_unlock(&fd->tid_lock);
 535        tinfo->tidcnt = tididx;
 536        mutex_unlock(&uctxt->exp_mutex);
 537
 538        kfree(tidinfo);
 539        return ret;
 540}
 541
 542int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
 543                              struct hfi1_tid_info *tinfo)
 544{
 545        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 546        unsigned long *ev = uctxt->dd->events +
 547                (uctxt_offset(uctxt) + fd->subctxt);
 548        u32 *array;
 549        int ret = 0;
 550
 551        /*
 552         * copy_to_user() can sleep, which will leave the invalid_lock
 553         * locked and cause the MMU notifier to be blocked on the lock
 554         * for a long time.
 555         * Copy the data to a local buffer so we can release the lock.
 556         */
 557        array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
 558        if (!array)
 559                return -EFAULT;
 560
 561        spin_lock(&fd->invalid_lock);
 562        if (fd->invalid_tid_idx) {
 563                memcpy(array, fd->invalid_tids, sizeof(*array) *
 564                       fd->invalid_tid_idx);
 565                memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
 566                       fd->invalid_tid_idx);
 567                tinfo->tidcnt = fd->invalid_tid_idx;
 568                fd->invalid_tid_idx = 0;
 569                /*
 570                 * Reset the user flag while still holding the lock.
 571                 * Otherwise, PSM can miss events.
 572                 */
 573                clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 574        } else {
 575                tinfo->tidcnt = 0;
 576        }
 577        spin_unlock(&fd->invalid_lock);
 578
 579        if (tinfo->tidcnt) {
 580                if (copy_to_user((void __user *)tinfo->tidlist,
 581                                 array, sizeof(*array) * tinfo->tidcnt))
 582                        ret = -EFAULT;
 583        }
 584        kfree(array);
 585
 586        return ret;
 587}
 588
 589static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
 590{
 591        unsigned pagecount, pageidx, setcount = 0, i;
 592        unsigned long pfn, this_pfn;
 593        struct page **pages = tidbuf->pages;
 594        struct tid_pageset *list = tidbuf->psets;
 595
 596        if (!npages)
 597                return 0;
 598
 599        /*
 600         * Look for sets of physically contiguous pages in the user buffer.
 601         * This will allow us to optimize Expected RcvArray entry usage by
 602         * using the bigger supported sizes.
 603         */
 604        pfn = page_to_pfn(pages[0]);
 605        for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
 606                this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
 607
 608                /*
 609                 * If the pfn's are not sequential, pages are not physically
 610                 * contiguous.
 611                 */
 612                if (this_pfn != ++pfn) {
 613                        /*
 614                         * At this point we have to loop over the set of
 615                         * physically contiguous pages and break them down it
 616                         * sizes supported by the HW.
 617                         * There are two main constraints:
 618                         *     1. The max buffer size is MAX_EXPECTED_BUFFER.
 619                         *        If the total set size is bigger than that
 620                         *        program only a MAX_EXPECTED_BUFFER chunk.
 621                         *     2. The buffer size has to be a power of two. If
 622                         *        it is not, round down to the closes power of
 623                         *        2 and program that size.
 624                         */
 625                        while (pagecount) {
 626                                int maxpages = pagecount;
 627                                u32 bufsize = pagecount * PAGE_SIZE;
 628
 629                                if (bufsize > MAX_EXPECTED_BUFFER)
 630                                        maxpages =
 631                                                MAX_EXPECTED_BUFFER >>
 632                                                PAGE_SHIFT;
 633                                else if (!is_power_of_2(bufsize))
 634                                        maxpages =
 635                                                rounddown_pow_of_two(bufsize) >>
 636                                                PAGE_SHIFT;
 637
 638                                list[setcount].idx = pageidx;
 639                                list[setcount].count = maxpages;
 640                                pagecount -= maxpages;
 641                                pageidx += maxpages;
 642                                setcount++;
 643                        }
 644                        pageidx = i;
 645                        pagecount = 1;
 646                        pfn = this_pfn;
 647                } else {
 648                        pagecount++;
 649                }
 650        }
 651        return setcount;
 652}
 653
 654/**
 655 * program_rcvarray() - program an RcvArray group with receive buffers
 656 * @fd: filedata pointer
 657 * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
 658 *        virtual address, buffer length, page pointers, pagesets (array of
 659 *        struct tid_pageset holding information on physically contiguous
 660 *        chunks from the user buffer), and other fields.
 661 * @grp: RcvArray group
 662 * @start: starting index into sets array
 663 * @count: number of struct tid_pageset's to program
 664 * @tidlist: the array of u32 elements when the information about the
 665 *           programmed RcvArray entries is to be encoded.
 666 * @tididx: starting offset into tidlist
 667 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 668 *           entries.
 669 *
 670 * This function will program up to 'count' number of RcvArray entries from the
 671 * group 'grp'. To make best use of write-combining writes, the function will
 672 * perform writes to the unused RcvArray entries which will be ignored by the
 673 * HW. Each RcvArray entry will be programmed with a physically contiguous
 674 * buffer chunk from the user's virtual buffer.
 675 *
 676 * Return:
 677 * -EINVAL if the requested count is larger than the size of the group,
 678 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 679 * number of RcvArray entries programmed.
 680 */
 681static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
 682                            struct tid_group *grp,
 683                            unsigned int start, u16 count,
 684                            u32 *tidlist, unsigned int *tididx,
 685                            unsigned int *pmapped)
 686{
 687        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 688        struct hfi1_devdata *dd = uctxt->dd;
 689        u16 idx;
 690        u32 tidinfo = 0, rcventry, useidx = 0;
 691        int mapped = 0;
 692
 693        /* Count should never be larger than the group size */
 694        if (count > grp->size)
 695                return -EINVAL;
 696
 697        /* Find the first unused entry in the group */
 698        for (idx = 0; idx < grp->size; idx++) {
 699                if (!(grp->map & (1 << idx))) {
 700                        useidx = idx;
 701                        break;
 702                }
 703                rcv_array_wc_fill(dd, grp->base + idx);
 704        }
 705
 706        idx = 0;
 707        while (idx < count) {
 708                u16 npages, pageidx, setidx = start + idx;
 709                int ret = 0;
 710
 711                /*
 712                 * If this entry in the group is used, move to the next one.
 713                 * If we go past the end of the group, exit the loop.
 714                 */
 715                if (useidx >= grp->size) {
 716                        break;
 717                } else if (grp->map & (1 << useidx)) {
 718                        rcv_array_wc_fill(dd, grp->base + useidx);
 719                        useidx++;
 720                        continue;
 721                }
 722
 723                rcventry = grp->base + useidx;
 724                npages = tbuf->psets[setidx].count;
 725                pageidx = tbuf->psets[setidx].idx;
 726
 727                ret = set_rcvarray_entry(fd, tbuf,
 728                                         rcventry, grp, pageidx,
 729                                         npages);
 730                if (ret)
 731                        return ret;
 732                mapped += npages;
 733
 734                tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
 735                        EXP_TID_SET(LEN, npages);
 736                tidlist[(*tididx)++] = tidinfo;
 737                grp->used++;
 738                grp->map |= 1 << useidx++;
 739                idx++;
 740        }
 741
 742        /* Fill the rest of the group with "blank" writes */
 743        for (; useidx < grp->size; useidx++)
 744                rcv_array_wc_fill(dd, grp->base + useidx);
 745        *pmapped = mapped;
 746        return idx;
 747}
 748
 749static int set_rcvarray_entry(struct hfi1_filedata *fd,
 750                              struct tid_user_buf *tbuf,
 751                              u32 rcventry, struct tid_group *grp,
 752                              u16 pageidx, unsigned int npages)
 753{
 754        int ret;
 755        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 756        struct tid_rb_node *node;
 757        struct hfi1_devdata *dd = uctxt->dd;
 758        dma_addr_t phys;
 759        struct page **pages = tbuf->pages + pageidx;
 760
 761        /*
 762         * Allocate the node first so we can handle a potential
 763         * failure before we've programmed anything.
 764         */
 765        node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
 766                       GFP_KERNEL);
 767        if (!node)
 768                return -ENOMEM;
 769
 770        phys = pci_map_single(dd->pcidev,
 771                              __va(page_to_phys(pages[0])),
 772                              npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
 773        if (dma_mapping_error(&dd->pcidev->dev, phys)) {
 774                dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
 775                           phys);
 776                kfree(node);
 777                return -EFAULT;
 778        }
 779
 780        node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE);
 781        node->mmu.len = npages * PAGE_SIZE;
 782        node->phys = page_to_phys(pages[0]);
 783        node->npages = npages;
 784        node->rcventry = rcventry;
 785        node->dma_addr = phys;
 786        node->grp = grp;
 787        node->freed = false;
 788        memcpy(node->pages, pages, sizeof(struct page *) * npages);
 789
 790        if (!fd->handler)
 791                ret = tid_rb_insert(fd, &node->mmu);
 792        else
 793                ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
 794
 795        if (ret) {
 796                hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
 797                          node->rcventry, node->mmu.addr, node->phys, ret);
 798                pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
 799                                 PCI_DMA_FROMDEVICE);
 800                kfree(node);
 801                return -EFAULT;
 802        }
 803        hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 804        trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
 805                               node->mmu.addr, node->phys, phys);
 806        return 0;
 807}
 808
 809static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
 810                              struct tid_group **grp)
 811{
 812        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 813        struct hfi1_devdata *dd = uctxt->dd;
 814        struct tid_rb_node *node;
 815        u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
 816        u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
 817
 818        if (tididx >= uctxt->expected_count) {
 819                dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
 820                           tididx, uctxt->ctxt);
 821                return -EINVAL;
 822        }
 823
 824        if (tidctrl == 0x3)
 825                return -EINVAL;
 826
 827        rcventry = tididx + (tidctrl - 1);
 828
 829        node = fd->entry_to_rb[rcventry];
 830        if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 831                return -EBADF;
 832
 833        if (grp)
 834                *grp = node->grp;
 835
 836        if (!fd->handler)
 837                cacheless_tid_rb_remove(fd, node);
 838        else
 839                hfi1_mmu_rb_remove(fd->handler, &node->mmu);
 840
 841        return 0;
 842}
 843
 844static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 845{
 846        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 847        struct hfi1_devdata *dd = uctxt->dd;
 848
 849        trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
 850                                 node->npages, node->mmu.addr, node->phys,
 851                                 node->dma_addr);
 852
 853        /*
 854         * Make sure device has seen the write before we unpin the
 855         * pages.
 856         */
 857        hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
 858
 859        unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
 860
 861        node->grp->used--;
 862        node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
 863
 864        if (node->grp->used == node->grp->size - 1)
 865                tid_group_move(node->grp, &uctxt->tid_full_list,
 866                               &uctxt->tid_used_list);
 867        else if (!node->grp->used)
 868                tid_group_move(node->grp, &uctxt->tid_used_list,
 869                               &uctxt->tid_group_list);
 870        kfree(node);
 871}
 872
 873/*
 874 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 875 * clearing nodes in the non-cached case.
 876 */
 877static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 878                            struct exp_tid_set *set,
 879                            struct hfi1_filedata *fd)
 880{
 881        struct tid_group *grp, *ptr;
 882        int i;
 883
 884        list_for_each_entry_safe(grp, ptr, &set->list, list) {
 885                list_del_init(&grp->list);
 886
 887                for (i = 0; i < grp->size; i++) {
 888                        if (grp->map & (1 << i)) {
 889                                u16 rcventry = grp->base + i;
 890                                struct tid_rb_node *node;
 891
 892                                node = fd->entry_to_rb[rcventry -
 893                                                          uctxt->expected_base];
 894                                if (!node || node->rcventry != rcventry)
 895                                        continue;
 896
 897                                cacheless_tid_rb_remove(fd, node);
 898                        }
 899                }
 900        }
 901}
 902
 903/*
 904 * Always return 0 from this function.  A non-zero return indicates that the
 905 * remove operation will be called and that memory should be unpinned.
 906 * However, the driver cannot unpin out from under PSM.  Instead, retain the
 907 * memory (by returning 0) and inform PSM that the memory is going away.  PSM
 908 * will call back later when it has removed the memory from its list.
 909 */
 910static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 911{
 912        struct hfi1_filedata *fdata = arg;
 913        struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 914        struct tid_rb_node *node =
 915                container_of(mnode, struct tid_rb_node, mmu);
 916
 917        if (node->freed)
 918                return 0;
 919
 920        trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
 921                                 node->rcventry, node->npages, node->dma_addr);
 922        node->freed = true;
 923
 924        spin_lock(&fdata->invalid_lock);
 925        if (fdata->invalid_tid_idx < uctxt->expected_count) {
 926                fdata->invalid_tids[fdata->invalid_tid_idx] =
 927                        rcventry2tidinfo(node->rcventry - uctxt->expected_base);
 928                fdata->invalid_tids[fdata->invalid_tid_idx] |=
 929                        EXP_TID_SET(LEN, node->npages);
 930                if (!fdata->invalid_tid_idx) {
 931                        unsigned long *ev;
 932
 933                        /*
 934                         * hfi1_set_uevent_bits() sets a user event flag
 935                         * for all processes. Because calling into the
 936                         * driver to process TID cache invalidations is
 937                         * expensive and TID cache invalidations are
 938                         * handled on a per-process basis, we can
 939                         * optimize this to set the flag only for the
 940                         * process in question.
 941                         */
 942                        ev = uctxt->dd->events +
 943                                (uctxt_offset(uctxt) + fdata->subctxt);
 944                        set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 945                }
 946                fdata->invalid_tid_idx++;
 947        }
 948        spin_unlock(&fdata->invalid_lock);
 949        return 0;
 950}
 951
 952static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
 953{
 954        struct hfi1_filedata *fdata = arg;
 955        struct tid_rb_node *tnode =
 956                container_of(node, struct tid_rb_node, mmu);
 957        u32 base = fdata->uctxt->expected_base;
 958
 959        fdata->entry_to_rb[tnode->rcventry - base] = tnode;
 960        return 0;
 961}
 962
 963static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 964                                    struct tid_rb_node *tnode)
 965{
 966        u32 base = fdata->uctxt->expected_base;
 967
 968        fdata->entry_to_rb[tnode->rcventry - base] = NULL;
 969        clear_tid_node(fdata, tnode);
 970}
 971
 972static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
 973{
 974        struct hfi1_filedata *fdata = arg;
 975        struct tid_rb_node *tnode =
 976                container_of(node, struct tid_rb_node, mmu);
 977
 978        cacheless_tid_rb_remove(fdata, tnode);
 979}
 980