linux/drivers/infiniband/hw/hfi1/user_exp_rcv.c
   1/*
   2 * Copyright(c) 2015, 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47#include <asm/page.h>
  48
  49#include "user_exp_rcv.h"
  50#include "trace.h"
  51#include "mmu_rb.h"
  52
  53struct tid_group {
  54        struct list_head list;
  55        unsigned base;
  56        u8 size;
  57        u8 used;
  58        u8 map;
  59};
  60
  61struct tid_rb_node {
  62        struct mmu_rb_node mmu;
  63        unsigned long phys;
  64        struct tid_group *grp;
  65        u32 rcventry;
  66        dma_addr_t dma_addr;
  67        bool freed;
  68        unsigned npages;
  69        struct page *pages[0];
  70};
  71
  72struct tid_pageset {
  73        u16 idx;
  74        u16 count;
  75};
  76
  77#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list))
  78
  79#define num_user_pages(vaddr, len)                                     \
  80        (1 + (((((unsigned long)(vaddr) +                              \
  81                 (unsigned long)(len) - 1) & PAGE_MASK) -              \
  82               ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT))
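
/*
 * Editorial sketch, not part of the driver: a worked example of the
 * num_user_pages() arithmetic, assuming a 4 KiB PAGE_SIZE. A buffer at
 * vaddr 0x1ff0 with len 0x20 straddles one page boundary:
 *
 *      last  = ((0x1ff0 + 0x20 - 1) & PAGE_MASK) = 0x2000
 *      first = ( 0x1ff0             & PAGE_MASK) = 0x1000
 *      pages = 1 + ((last - first) >> PAGE_SHIFT) = 2
 *
 * i.e. the macro counts every page the [vaddr, vaddr + len) range touches,
 * not simply len / PAGE_SIZE.
 */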
  83
  84static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
  85                            struct hfi1_filedata *);
  86static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
  87static int set_rcvarray_entry(struct file *, unsigned long, u32,
  88                              struct tid_group *, struct page **, unsigned);
  89static int tid_rb_insert(void *, struct mmu_rb_node *);
  90static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
  91                                    struct tid_rb_node *tnode);
  92static void tid_rb_remove(void *, struct mmu_rb_node *);
  93static int tid_rb_invalidate(void *, struct mmu_rb_node *);
  94static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
  95                            struct tid_pageset *, unsigned, u16, struct page **,
  96                            u32 *, unsigned *, unsigned *);
  97static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
  98static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
  99
 100static struct mmu_rb_ops tid_rb_ops = {
 101        .insert = tid_rb_insert,
 102        .remove = tid_rb_remove,
 103        .invalidate = tid_rb_invalidate
 104};
 105
 106static inline u32 rcventry2tidinfo(u32 rcventry)
 107{
 108        u32 pair = rcventry & ~0x1;
 109
 110        return EXP_TID_SET(IDX, pair >> 1) |
 111                EXP_TID_SET(CTRL, 1 << (rcventry - pair));
 112}
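
/*
 * Editorial sketch, not part of the driver: RcvArray entries are exported
 * to user space in pairs, and the helper above packs the pair index and a
 * "which half" flag into one word. For a context-relative rcventry of 5:
 *
 *      pair = 5 & ~0x1                 -> 4
 *      IDX  = pair >> 1                -> 2   (which pair)
 *      CTRL = 1 << (5 - pair)          -> 2   (odd half; 1 means even half)
 *
 * unprogram_rcvarray() inverts this with tididx = IDX << 1 and
 * rcventry = tididx + (CTRL - 1); CTRL == 0x3 is rejected there as invalid.
 */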
 113
 114static inline void exp_tid_group_init(struct exp_tid_set *set)
 115{
 116        INIT_LIST_HEAD(&set->list);
 117        set->count = 0;
 118}
 119
 120static inline void tid_group_remove(struct tid_group *grp,
 121                                    struct exp_tid_set *set)
 122{
 123        list_del_init(&grp->list);
 124        set->count--;
 125}
 126
 127static inline void tid_group_add_tail(struct tid_group *grp,
 128                                      struct exp_tid_set *set)
 129{
 130        list_add_tail(&grp->list, &set->list);
 131        set->count++;
 132}
 133
 134static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
 135{
 136        struct tid_group *grp =
 137                list_first_entry(&set->list, struct tid_group, list);
 138        list_del_init(&grp->list);
 139        set->count--;
 140        return grp;
 141}
 142
 143static inline void tid_group_move(struct tid_group *group,
 144                                  struct exp_tid_set *s1,
 145                                  struct exp_tid_set *s2)
 146{
 147        tid_group_remove(group, s1);
 148        tid_group_add_tail(group, s2);
 149}
 150
 151/*
 152 * Initialize context and file private data needed for Expected
 153 * receive caching. This needs to be done after the context has
 154 * been configured with the eager/expected RcvEntry counts.
 155 */
 156int hfi1_user_exp_rcv_init(struct file *fp)
 157{
 158        struct hfi1_filedata *fd = fp->private_data;
 159        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 160        struct hfi1_devdata *dd = uctxt->dd;
 161        unsigned tidbase;
 162        int i, ret = 0;
 163
 164        spin_lock_init(&fd->tid_lock);
 165        spin_lock_init(&fd->invalid_lock);
 166
 167        if (!uctxt->subctxt_cnt || !fd->subctxt) {
 168                exp_tid_group_init(&uctxt->tid_group_list);
 169                exp_tid_group_init(&uctxt->tid_used_list);
 170                exp_tid_group_init(&uctxt->tid_full_list);
 171
 172                tidbase = uctxt->expected_base;
 173                for (i = 0; i < uctxt->expected_count /
 174                             dd->rcv_entries.group_size; i++) {
 175                        struct tid_group *grp;
 176
 177                        grp = kzalloc(sizeof(*grp), GFP_KERNEL);
 178                        if (!grp) {
 179                                /*
 180                                 * If we fail here, the groups already
 181                                 * allocated will be freed by the close
 182                                 * call.
 183                                 */
 184                                ret = -ENOMEM;
 185                                goto done;
 186                        }
 187                        grp->size = dd->rcv_entries.group_size;
 188                        grp->base = tidbase;
 189                        tid_group_add_tail(grp, &uctxt->tid_group_list);
 190                        tidbase += dd->rcv_entries.group_size;
 191                }
 192        }
 193
 194        fd->entry_to_rb = kcalloc(uctxt->expected_count,
 195                                     sizeof(struct rb_node *),
 196                                     GFP_KERNEL);
 197        if (!fd->entry_to_rb)
 198                return -ENOMEM;
 199
 200        if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
 201                fd->invalid_tid_idx = 0;
 202                fd->invalid_tids = kzalloc(uctxt->expected_count *
 203                                           sizeof(u32), GFP_KERNEL);
 204                if (!fd->invalid_tids) {
 205                        ret = -ENOMEM;
 206                        goto done;
 207                }
 208
 209                /*
 210                 * Register MMU notifier callbacks. If the registration
 211                 * fails, continue without TID caching for this context.
 212                 */
 213                ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
 214                                           dd->pport->hfi1_wq,
 215                                           &fd->handler);
 216                if (ret) {
 217                        dd_dev_info(dd,
 218                                    "Failed MMU notifier registration %d\n",
 219                                    ret);
 220                        ret = 0;
 221                }
 222        }
 223
 224        /*
 225         * PSM does not have a good way to separate, count, and
 226         * effectively enforce a limit on RcvArray entries used by
 227         * subctxts (when context sharing is used) when TID caching
 228         * is enabled. To help with that, we calculate a per-process
 229         * RcvArray entry share and enforce that.
 230         * If TID caching is not in use, PSM deals with usage on its
 231         * own. In that case, we allow any subctxt to take all of the
 232         * entries.
 233         *
 234         * Make sure that we set the tid counts only after successful
 235         * init.
 236         */
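        /*
         * Editorial example with made-up numbers: with expected_count =
         * 2048 and subctxt_cnt = 3, the shares below work out to 683, 683
         * and 682 entries; subctxts whose index is below the remainder
         * (2048 % 3 = 2) each absorb one extra entry, so the shares still
         * sum to expected_count.
         */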
 237        spin_lock(&fd->tid_lock);
 238        if (uctxt->subctxt_cnt && fd->handler) {
 239                u16 remainder;
 240
 241                fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
 242                remainder = uctxt->expected_count % uctxt->subctxt_cnt;
 243                if (remainder && fd->subctxt < remainder)
 244                        fd->tid_limit++;
 245        } else {
 246                fd->tid_limit = uctxt->expected_count;
 247        }
 248        spin_unlock(&fd->tid_lock);
 249done:
 250        return ret;
 251}
 252
 253int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
 254{
 255        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 256        struct tid_group *grp, *gptr;
 257
 258        if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags))
 259                return 0;
 260        /*
 261 * The notifier would have been removed when the process's mm
 262         * was freed.
 263         */
 264        if (fd->handler)
 265                hfi1_mmu_rb_unregister(fd->handler);
 266
 267        kfree(fd->invalid_tids);
 268
 269        if (!uctxt->cnt) {
 270                if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
 271                        unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
 272                if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
 273                        unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
 274                list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list,
 275                                         list) {
 276                        list_del_init(&grp->list);
 277                        kfree(grp);
 278                }
 279                hfi1_clear_tids(uctxt);
 280        }
 281
 282        kfree(fd->entry_to_rb);
 283        return 0;
 284}
 285
 286/*
 287 * Write an "empty" RcvArray entry.
 288 * This function exists so the TID registration code can use it
 289 * to write to unused/unneeded entries and still take advantage
 290 * of the WC performance improvements. The HFI will ignore this
 291 * write to the RcvArray entry.
 292 */
 293static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
 294{
 295        /*
 296         * Doing the WC fill writes only makes sense if the device is
 297         * present and the RcvArray has been mapped as WC memory.
 298         */
 299        if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc)
 300                writeq(0, dd->rcvarray_wc + (index * 8));
 301}
 302
 303/*
 304 * RcvArray entry allocation for Expected Receives is done by the
 305 * following algorithm:
 306 *
 307 * The context keeps 3 lists of groups of RcvArray entries:
 308 *   1. List of empty groups - tid_group_list
 309 *      This list is created during user context creation and
 310 *      contains elements which describe sets (of 8) of empty
 311 *      RcvArray entries.
 312 *   2. List of partially used groups - tid_used_list
 313 *      This list contains sets of RcvArray entries which are
 314 *      not completely used up. Another mapping request could
 315 *      use some or all of the remaining entries.
 316 *   3. List of full groups - tid_full_list
 317 *      This is the list where sets that are completely used
 318 *      up go.
 319 *
 320 * An attempt to optimize the usage of RcvArray entries is
 321 * made by finding all sets of physically contiguous pages in a
 322 * user's buffer.
 323 * These physically contiguous sets are further split into
 324 * sizes supported by the receive engine of the HFI. The
 325 * resulting sets of pages are stored in struct tid_pageset,
 326 * which describes the sets as:
 327 *    * .count - number of pages in this set
 328 *    * .idx - starting index into struct page ** array
 329 *                    of this set
 330 *
 331 * From this point on, the algorithm deals with the page sets
 332 * described above. The number of pagesets is divided by the
 333 * RcvArray group size to produce the number of full groups
 334 * needed.
 335 *
 336 * Groups from the 3 lists are manipulated using the following
 337 * rules:
 338 *   1. For each set of 8 pagesets, a complete group from
 339 *      tid_group_list is taken, programmed, and moved to
 340 *      the tid_full_list list.
 341 *   2. For all remaining pagesets:
 342 *      2.1 If the tid_used_list is empty and the tid_group_list
 343 *          is empty, stop processing pagesets and return only
 344 *          what has been programmed up to this point.
 345 *      2.2 If the tid_used_list is empty and the tid_group_list
 346 *          is not empty, move a group from tid_group_list to
 347 *          tid_used_list.
 348 *      2.3 For each group in tid_used_list, program as much as
 349 *          can fit into the group. If the group becomes fully
 350 *          used, move it to tid_full_list.
 351 */
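
/*
 * Editorial sketch of the accounting above, assuming the usual group size
 * of 8: a request that yields 19 pagesets needs 19 / 8 = 2 complete groups
 * plus 3 leftover pagesets. Two groups are popped from tid_group_list,
 * fully programmed and moved to tid_full_list (rule 1); the remaining 3
 * pagesets are then packed into free slots of groups on tid_used_list,
 * pulling one more group over from tid_group_list if needed (rules
 * 2.1 to 2.3).
 */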
 352int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 353{
 354        int ret = 0, need_group = 0, pinned;
 355        struct hfi1_filedata *fd = fp->private_data;
 356        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 357        struct hfi1_devdata *dd = uctxt->dd;
 358        unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
 359                tididx = 0, mapped, mapped_pages = 0;
 360        unsigned long vaddr = tinfo->vaddr;
 361        struct page **pages = NULL;
 362        u32 *tidlist = NULL;
 363        struct tid_pageset *pagesets = NULL;
 364
 365        /* Get the number of pages the user buffer spans */
 366        npages = num_user_pages(vaddr, tinfo->length);
 367        if (!npages)
 368                return -EINVAL;
 369
 370        if (npages > uctxt->expected_count) {
 371                dd_dev_err(dd, "Expected buffer too big\n");
 372                return -EINVAL;
 373        }
 374
 375        /* Verify that access is OK for the user buffer */
 376        if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
 377                       npages * PAGE_SIZE)) {
 378                dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
 379                           (void *)vaddr, npages);
 380                return -EFAULT;
 381        }
 382
 383        pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
 384                           GFP_KERNEL);
 385        if (!pagesets)
 386                return -ENOMEM;
 387
 388        /* Allocate the array of struct page pointers needed for pinning */
 389        pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 390        if (!pages) {
 391                ret = -ENOMEM;
 392                goto bail;
 393        }
 394
 395        /*
 396         * Pin all the pages of the user buffer. If we can't pin all the
 397         * pages, accept the amount pinned so far and program only that.
 398         * User space knows how to deal with partially programmed buffers.
 399         */
 400        if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
 401                ret = -ENOMEM;
 402                goto bail;
 403        }
 404
 405        pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
 406        if (pinned <= 0) {
 407                ret = pinned;
 408                goto bail;
 409        }
 410        fd->tid_n_pinned += pinned;
 411
 412        /* Find sets of physically contiguous pages */
 413        npagesets = find_phys_blocks(pages, pinned, pagesets);
 414
 415        /*
 416         * We don't need to access this under a lock since tid_used is per
 417         * process and the same process cannot be in hfi1_user_exp_rcv_clear()
 418         * and hfi1_user_exp_rcv_setup() at the same time.
 419         */
 420        spin_lock(&fd->tid_lock);
 421        if (fd->tid_used + npagesets > fd->tid_limit)
 422                pageset_count = fd->tid_limit - fd->tid_used;
 423        else
 424                pageset_count = npagesets;
 425        spin_unlock(&fd->tid_lock);
 426
 427        if (!pageset_count)
 428                goto bail;
 429
 430        ngroups = pageset_count / dd->rcv_entries.group_size;
 431        tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
 432        if (!tidlist) {
 433                ret = -ENOMEM;
 434                goto nomem;
 435        }
 436
 437        tididx = 0;
 438
 439        /*
 440         * From this point on, we are going to be using shared (between master
 441         * and subcontexts) context resources. We need to take the lock.
 442         */
 443        mutex_lock(&uctxt->exp_lock);
 444        /*
 445         * The first step is to program the RcvArray entries which are complete
 446         * groups.
 447         */
 448        while (ngroups && uctxt->tid_group_list.count) {
 449                struct tid_group *grp =
 450                        tid_group_pop(&uctxt->tid_group_list);
 451
 452                ret = program_rcvarray(fp, vaddr, grp, pagesets,
 453                                       pageidx, dd->rcv_entries.group_size,
 454                                       pages, tidlist, &tididx, &mapped);
 455                /*
 456                 * If there was a failure to program the RcvArray
 457                 * entries for the entire group, reset the grp fields
 458                 * and add the grp back to the free group list.
 459                 */
 460                if (ret <= 0) {
 461                        tid_group_add_tail(grp, &uctxt->tid_group_list);
 462                        hfi1_cdbg(TID,
 463                                  "Failed to program RcvArray group %d", ret);
 464                        goto unlock;
 465                }
 466
 467                tid_group_add_tail(grp, &uctxt->tid_full_list);
 468                ngroups--;
 469                pageidx += ret;
 470                mapped_pages += mapped;
 471        }
 472
 473        while (pageidx < pageset_count) {
 474                struct tid_group *grp, *ptr;
 475                /*
 476                 * If we don't have any partially used tid groups, check
 477                 * if we have empty groups. If so, take one from there and
 478                 * put it in the partially used list.
 479                 */
 480                if (!uctxt->tid_used_list.count || need_group) {
 481                        if (!uctxt->tid_group_list.count)
 482                                goto unlock;
 483
 484                        grp = tid_group_pop(&uctxt->tid_group_list);
 485                        tid_group_add_tail(grp, &uctxt->tid_used_list);
 486                        need_group = 0;
 487                }
 488                /*
 489                 * There is an optimization opportunity here - instead of
 490                 * fitting as many page sets as we can, check for a group
 491                 * later on in the list that could fit all of them.
 492                 */
 493                list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
 494                                         list) {
 495                        unsigned use = min_t(unsigned, pageset_count - pageidx,
 496                                             grp->size - grp->used);
 497
 498                        ret = program_rcvarray(fp, vaddr, grp, pagesets,
 499                                               pageidx, use, pages, tidlist,
 500                                               &tididx, &mapped);
 501                        if (ret < 0) {
 502                                hfi1_cdbg(TID,
 503                                          "Failed to program RcvArray entries %d",
 504                                          ret);
 505                                ret = -EFAULT;
 506                                goto unlock;
 507                        } else if (ret > 0) {
 508                                if (grp->used == grp->size)
 509                                        tid_group_move(grp,
 510                                                       &uctxt->tid_used_list,
 511                                                       &uctxt->tid_full_list);
 512                                pageidx += ret;
 513                                mapped_pages += mapped;
 514                                need_group = 0;
 515                                /* Check if we are done so we break out early */
 516                                if (pageidx >= pageset_count)
 517                                        break;
 518                        } else if (WARN_ON(ret == 0)) {
 519                                /*
 520                                 * If ret is 0, we did not program any entries
 521                                 * into this group, which can only happen if
 522                                 * we've screwed up the accounting somewhere.
 523                                 * Warn and try to continue.
 524                                 */
 525                                need_group = 1;
 526                        }
 527                }
 528        }
 529unlock:
 530        mutex_unlock(&uctxt->exp_lock);
 531nomem:
 532        hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
 533                  mapped_pages, ret);
 534        if (tididx) {
 535                spin_lock(&fd->tid_lock);
 536                fd->tid_used += tididx;
 537                spin_unlock(&fd->tid_lock);
 538                tinfo->tidcnt = tididx;
 539                tinfo->length = mapped_pages * PAGE_SIZE;
 540
 541                if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
 542                                 tidlist, sizeof(tidlist[0]) * tididx)) {
 543                        /*
 544                         * On failure to copy to the user level, we need to undo
 545                         * everything done so far so we don't leak resources.
 546                         */
 547                        tinfo->tidlist = (unsigned long)&tidlist;
 548                        hfi1_user_exp_rcv_clear(fp, tinfo);
 549                        tinfo->tidlist = 0;
 550                        ret = -EFAULT;
 551                        goto bail;
 552                }
 553        }
 554
 555        /*
 556         * If not everything was mapped (due to insufficient RcvArray entries,
 557         * for example), unpin all unmapped pages so we can pin them next time.
 558         */
 559        if (mapped_pages != pinned) {
 560                hfi1_release_user_pages(fd->mm, &pages[mapped_pages],
 561                                        pinned - mapped_pages,
 562                                        false);
 563                fd->tid_n_pinned -= pinned - mapped_pages;
 564        }
 565bail:
 566        kfree(pagesets);
 567        kfree(pages);
 568        kfree(tidlist);
 569        return ret > 0 ? 0 : ret;
 570}
 571
 572int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)
 573{
 574        int ret = 0;
 575        struct hfi1_filedata *fd = fp->private_data;
 576        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 577        u32 *tidinfo;
 578        unsigned tididx;
 579
 580        tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL);
 581        if (!tidinfo)
 582                return -ENOMEM;
 583
 584        if (copy_from_user(tidinfo, (void __user *)(unsigned long)
 585                           tinfo->tidlist, sizeof(tidinfo[0]) *
 586                           tinfo->tidcnt)) {
 587                ret = -EFAULT;
 588                goto done;
 589        }
 590
 591        mutex_lock(&uctxt->exp_lock);
 592        for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
 593                ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL);
 594                if (ret) {
 595                        hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
 596                                  ret);
 597                        break;
 598                }
 599        }
 600        spin_lock(&fd->tid_lock);
 601        fd->tid_used -= tididx;
 602        spin_unlock(&fd->tid_lock);
 603        tinfo->tidcnt = tididx;
 604        mutex_unlock(&uctxt->exp_lock);
 605done:
 606        kfree(tidinfo);
 607        return ret;
 608}
 609
 610int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo)
 611{
 612        struct hfi1_filedata *fd = fp->private_data;
 613        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 614        unsigned long *ev = uctxt->dd->events +
 615                (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
 616                  HFI1_MAX_SHARED_CTXTS) + fd->subctxt);
 617        u32 *array;
 618        int ret = 0;
 619
 620        if (!fd->invalid_tids)
 621                return -EINVAL;
 622
 623        /*
 624         * copy_to_user() can sleep, which will leave the invalid_lock
 625         * locked and cause the MMU notifier to be blocked on the lock
 626         * for a long time.
 627         * Copy the data to a local buffer so we can release the lock.
 628         */
 629        array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
 630        if (!array)
 631                return -ENOMEM;
 632
 633        spin_lock(&fd->invalid_lock);
 634        if (fd->invalid_tid_idx) {
 635                memcpy(array, fd->invalid_tids, sizeof(*array) *
 636                       fd->invalid_tid_idx);
 637                memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
 638                       fd->invalid_tid_idx);
 639                tinfo->tidcnt = fd->invalid_tid_idx;
 640                fd->invalid_tid_idx = 0;
 641                /*
 642                 * Reset the user flag while still holding the lock.
 643                 * Otherwise, PSM can miss events.
 644                 */
 645                clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
 646        } else {
 647                tinfo->tidcnt = 0;
 648        }
 649        spin_unlock(&fd->invalid_lock);
 650
 651        if (tinfo->tidcnt) {
 652                if (copy_to_user((void __user *)tinfo->tidlist,
 653                                 array, sizeof(*array) * tinfo->tidcnt))
 654                        ret = -EFAULT;
 655        }
 656        kfree(array);
 657
 658        return ret;
 659}
 660
 661static u32 find_phys_blocks(struct page **pages, unsigned npages,
 662                            struct tid_pageset *list)
 663{
 664        unsigned pagecount, pageidx, setcount = 0, i;
 665        unsigned long pfn, this_pfn;
 666
 667        if (!npages)
 668                return 0;
 669
 670        /*
 671         * Look for sets of physically contiguous pages in the user buffer.
 672         * This will allow us to optimize Expected RcvArray entry usage by
 673         * using the bigger supported sizes.
 674         */
 675        pfn = page_to_pfn(pages[0]);
 676        for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
 677                this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;
 678
 679                /*
 680                 * If the pfn's are not sequential, pages are not physically
 681                 * contiguous.
 682                 */
 683                if (this_pfn != ++pfn) {
 684                        /*
 685                         * At this point we have to loop over the set of
 686                         * physically contiguous pages and break them down into
 687                         * sizes supported by the HW.
 688                         * There are two main constraints:
 689                         *     1. The max buffer size is MAX_EXPECTED_BUFFER.
 690                         *        If the total set size is bigger than that,
 691                         *        program only a MAX_EXPECTED_BUFFER chunk.
 692                         *     2. The buffer size has to be a power of two. If
 693                         *        it is not, round down to the closest power of
 694                         *        2 and program that size.
 695                         */
 696                        while (pagecount) {
 697                                int maxpages = pagecount;
 698                                u32 bufsize = pagecount * PAGE_SIZE;
 699
 700                                if (bufsize > MAX_EXPECTED_BUFFER)
 701                                        maxpages =
 702                                                MAX_EXPECTED_BUFFER >>
 703                                                PAGE_SHIFT;
 704                                else if (!is_power_of_2(bufsize))
 705                                        maxpages =
 706                                                rounddown_pow_of_two(bufsize) >>
 707                                                PAGE_SHIFT;
 708
 709                                list[setcount].idx = pageidx;
 710                                list[setcount].count = maxpages;
 711                                pagecount -= maxpages;
 712                                pageidx += maxpages;
 713                                setcount++;
 714                        }
 715                        pageidx = i;
 716                        pagecount = 1;
 717                        pfn = this_pfn;
 718                } else {
 719                        pagecount++;
 720                }
 721        }
 722        return setcount;
 723}
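
/*
 * Editorial sketch, not part of the driver: how the loop above splits a
 * contiguous run, assuming 4 KiB pages and a MAX_EXPECTED_BUFFER larger
 * than the run. A run of 11 physically contiguous pages becomes three
 * pagesets:
 *
 *      pagecount = 11 -> 44 KiB, not a power of two -> take 8 pages
 *      pagecount =  3 -> 12 KiB, not a power of two -> take 2 pages
 *      pagecount =  1 ->  4 KiB, power of two       -> take 1 page
 *
 * so every programmed chunk has a power-of-two size the receive engine can
 * express, at the cost of extra RcvArray entries for odd-sized runs.
 */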
 724
 725/**
 726 * program_rcvarray() - program an RcvArray group with receive buffers
 727 * @fp: file pointer
 728 * @vaddr: starting user virtual address
 729 * @grp: RcvArray group
 730 * @sets: array of struct tid_pageset holding information on physically
 731 *        contiguous chunks from the user buffer
 732 * @start: starting index into sets array
 733 * @count: number of struct tid_pageset's to program
 734 * @pages: an array of struct page * for the user buffer
 735 * @tidlist: the array of u32 elements where the information about the
 736 *           programmed RcvArray entries is to be encoded.
 737 * @tididx: starting offset into tidlist
 738 * @pmapped: (output parameter) number of pages programmed into the RcvArray
 739 *           entries.
 740 *
 741 * This function will program up to 'count' RcvArray entries from the
 742 * group 'grp'. To make best use of write-combining writes, the function will
 743 * perform writes to the unused RcvArray entries which will be ignored by the
 744 * HW. Each RcvArray entry will be programmed with a physically contiguous
 745 * buffer chunk from the user's virtual buffer.
 746 *
 747 * Return:
 748 * -EINVAL if the requested count is larger than the size of the group,
 749 * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
 750 * number of RcvArray entries programmed.
 751 */
 752static int program_rcvarray(struct file *fp, unsigned long vaddr,
 753                            struct tid_group *grp,
 754                            struct tid_pageset *sets,
 755                            unsigned start, u16 count, struct page **pages,
 756                            u32 *tidlist, unsigned *tididx, unsigned *pmapped)
 757{
 758        struct hfi1_filedata *fd = fp->private_data;
 759        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 760        struct hfi1_devdata *dd = uctxt->dd;
 761        u16 idx;
 762        u32 tidinfo = 0, rcventry, useidx = 0;
 763        int mapped = 0;
 764
 765        /* Count should never be larger than the group size */
 766        if (count > grp->size)
 767                return -EINVAL;
 768
 769        /* Find the first unused entry in the group */
 770        for (idx = 0; idx < grp->size; idx++) {
 771                if (!(grp->map & (1 << idx))) {
 772                        useidx = idx;
 773                        break;
 774                }
 775                rcv_array_wc_fill(dd, grp->base + idx);
 776        }
 777
 778        idx = 0;
 779        while (idx < count) {
 780                u16 npages, pageidx, setidx = start + idx;
 781                int ret = 0;
 782
 783                /*
 784                 * If this entry in the group is used, move to the next one.
 785                 * If we go past the end of the group, exit the loop.
 786                 */
 787                if (useidx >= grp->size) {
 788                        break;
 789                } else if (grp->map & (1 << useidx)) {
 790                        rcv_array_wc_fill(dd, grp->base + useidx);
 791                        useidx++;
 792                        continue;
 793                }
 794
 795                rcventry = grp->base + useidx;
 796                npages = sets[setidx].count;
 797                pageidx = sets[setidx].idx;
 798
 799                ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE),
 800                                         rcventry, grp, pages + pageidx,
 801                                         npages);
 802                if (ret)
 803                        return ret;
 804                mapped += npages;
 805
 806                tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
 807                        EXP_TID_SET(LEN, npages);
 808                tidlist[(*tididx)++] = tidinfo;
 809                grp->used++;
 810                grp->map |= 1 << useidx++;
 811                idx++;
 812        }
 813
 814        /* Fill the rest of the group with "blank" writes */
 815        for (; useidx < grp->size; useidx++)
 816                rcv_array_wc_fill(dd, grp->base + useidx);
 817        *pmapped = mapped;
 818        return idx;
 819}
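
/*
 * Editorial sketch of the group bookkeeping above: grp->map has one bit
 * per RcvArray entry in the group (typically 8). If a group arrives with
 * map 0b00000111 (entries 0-2 already in use) and two pagesets are
 * requested, entries 0-2 and 5-7 only get the WC filler writes, entries 3
 * and 4 are programmed, and the group leaves with map 0b00011111, used
 * incremented by 2, and a return value of 2.
 */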
 820
 821static int set_rcvarray_entry(struct file *fp, unsigned long vaddr,
 822                              u32 rcventry, struct tid_group *grp,
 823                              struct page **pages, unsigned npages)
 824{
 825        int ret;
 826        struct hfi1_filedata *fd = fp->private_data;
 827        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 828        struct tid_rb_node *node;
 829        struct hfi1_devdata *dd = uctxt->dd;
 830        dma_addr_t phys;
 831
 832        /*
 833         * Allocate the node first so we can handle a potential
 834         * failure before we've programmed anything.
 835         */
 836        node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages),
 837                       GFP_KERNEL);
 838        if (!node)
 839                return -ENOMEM;
 840
 841        phys = pci_map_single(dd->pcidev,
 842                              __va(page_to_phys(pages[0])),
 843                              npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
 844        if (dma_mapping_error(&dd->pcidev->dev, phys)) {
 845                dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
 846                           phys);
 847                kfree(node);
 848                return -EFAULT;
 849        }
 850
 851        node->mmu.addr = vaddr;
 852        node->mmu.len = npages * PAGE_SIZE;
 853        node->phys = page_to_phys(pages[0]);
 854        node->npages = npages;
 855        node->rcventry = rcventry;
 856        node->dma_addr = phys;
 857        node->grp = grp;
 858        node->freed = false;
 859        memcpy(node->pages, pages, sizeof(struct page *) * npages);
 860
 861        if (!fd->handler)
 862                ret = tid_rb_insert(fd, &node->mmu);
 863        else
 864                ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
 865
 866        if (ret) {
 867                hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
 868                          node->rcventry, node->mmu.addr, node->phys, ret);
 869                pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
 870                                 PCI_DMA_FROMDEVICE);
 871                kfree(node);
 872                return -EFAULT;
 873        }
 874        hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 875        trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
 876                               node->mmu.addr, node->phys, phys);
 877        return 0;
 878}
 879
 880static int unprogram_rcvarray(struct file *fp, u32 tidinfo,
 881                              struct tid_group **grp)
 882{
 883        struct hfi1_filedata *fd = fp->private_data;
 884        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 885        struct hfi1_devdata *dd = uctxt->dd;
 886        struct tid_rb_node *node;
 887        u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
 888        u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
 889
 890        if (tididx >= uctxt->expected_count) {
 891                dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
 892                           tididx, uctxt->ctxt);
 893                return -EINVAL;
 894        }
 895
 896        if (tidctrl == 0x3)
 897                return -EINVAL;
 898
 899        rcventry = tididx + (tidctrl - 1);
 900
 901        node = fd->entry_to_rb[rcventry];
 902        if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 903                return -EBADF;
 904
 905        if (grp)
 906                *grp = node->grp;
 907
 908        if (!fd->handler)
 909                cacheless_tid_rb_remove(fd, node);
 910        else
 911                hfi1_mmu_rb_remove(fd->handler, &node->mmu);
 912
 913        return 0;
 914}
 915
 916static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
 917{
 918        struct hfi1_ctxtdata *uctxt = fd->uctxt;
 919        struct hfi1_devdata *dd = uctxt->dd;
 920
 921        trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
 922                                 node->npages, node->mmu.addr, node->phys,
 923                                 node->dma_addr);
 924
 925        hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0);
 926        /*
 927         * Make sure device has seen the write before we unpin the
 928         * pages.
 929         */
 930        flush_wc();
 931
 932        pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len,
 933                         PCI_DMA_FROMDEVICE);
 934        hfi1_release_user_pages(fd->mm, node->pages, node->npages, true);
 935        fd->tid_n_pinned -= node->npages;
 936
 937        node->grp->used--;
 938        node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
 939
 940        if (node->grp->used == node->grp->size - 1)
 941                tid_group_move(node->grp, &uctxt->tid_full_list,
 942                               &uctxt->tid_used_list);
 943        else if (!node->grp->used)
 944                tid_group_move(node->grp, &uctxt->tid_used_list,
 945                               &uctxt->tid_group_list);
 946        kfree(node);
 947}
 948
 949/*
 950 * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
 951 * clearing nodes in the non-cached case.
 952 */
 953static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
 954                            struct exp_tid_set *set,
 955                            struct hfi1_filedata *fd)
 956{
 957        struct tid_group *grp, *ptr;
 958        int i;
 959
 960        list_for_each_entry_safe(grp, ptr, &set->list, list) {
 961                list_del_init(&grp->list);
 962
 963                for (i = 0; i < grp->size; i++) {
 964                        if (grp->map & (1 << i)) {
 965                                u16 rcventry = grp->base + i;
 966                                struct tid_rb_node *node;
 967
 968                                node = fd->entry_to_rb[rcventry -
 969                                                          uctxt->expected_base];
 970                                if (!node || node->rcventry != rcventry)
 971                                        continue;
 972
 973                                cacheless_tid_rb_remove(fd, node);
 974                        }
 975                }
 976        }
 977}
 978
 979/*
 980 * Always return 0 from this function.  A non-zero return indicates that the
 981 * remove operation will be called and that memory should be unpinned.
 982 * However, the driver cannot unpin out from under PSM.  Instead, retain the
 983 * memory (by returning 0) and inform PSM that the memory is going away.  PSM
 984 * will call back later when it has removed the memory from its list.
 985 */
 986static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
 987{
 988        struct hfi1_filedata *fdata = arg;
 989        struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 990        struct tid_rb_node *node =
 991                container_of(mnode, struct tid_rb_node, mmu);
 992
 993        if (node->freed)
 994                return 0;
 995
 996        trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
 997                                 node->rcventry, node->npages, node->dma_addr);
 998        node->freed = true;
 999
1000        spin_lock(&fdata->invalid_lock);
1001        if (fdata->invalid_tid_idx < uctxt->expected_count) {
1002                fdata->invalid_tids[fdata->invalid_tid_idx] =
1003                        rcventry2tidinfo(node->rcventry - uctxt->expected_base);
1004                fdata->invalid_tids[fdata->invalid_tid_idx] |=
1005                        EXP_TID_SET(LEN, node->npages);
1006                if (!fdata->invalid_tid_idx) {
1007                        unsigned long *ev;
1008
1009                        /*
1010                         * hfi1_set_uevent_bits() sets a user event flag
1011                         * for all processes. Because calling into the
1012                         * driver to process TID cache invalidations is
1013                         * expensive and TID cache invalidations are
1014                         * handled on a per-process basis, we can
1015                         * optimize this to set the flag only for the
1016                         * process in question.
1017                         */
1018                        ev = uctxt->dd->events +
1019                                (((uctxt->ctxt - uctxt->dd->first_user_ctxt) *
1020                                  HFI1_MAX_SHARED_CTXTS) + fdata->subctxt);
1021                        set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
1022                }
1023                fdata->invalid_tid_idx++;
1024        }
1025        spin_unlock(&fdata->invalid_lock);
1026        return 0;
1027}
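
/*
 * Editorial note on the event-slot arithmetic above, with made-up numbers:
 * dd->events is carved into HFI1_MAX_SHARED_CTXTS slots per user context.
 * Assuming first_user_ctxt = 4, HFI1_MAX_SHARED_CTXTS = 8, ctxt = 6 and
 * subctxt = 2, the flag lands in slot (6 - 4) * 8 + 2 = 18, so only the
 * process attached to that subcontext sees _HFI1_EVENT_TID_MMU_NOTIFY_BIT.
 */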
1028
1029static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
1030{
1031        struct hfi1_filedata *fdata = arg;
1032        struct tid_rb_node *tnode =
1033                container_of(node, struct tid_rb_node, mmu);
1034        u32 base = fdata->uctxt->expected_base;
1035
1036        fdata->entry_to_rb[tnode->rcventry - base] = tnode;
1037        return 0;
1038}
1039
1040static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
1041                                    struct tid_rb_node *tnode)
1042{
1043        u32 base = fdata->uctxt->expected_base;
1044
1045        fdata->entry_to_rb[tnode->rcventry - base] = NULL;
1046        clear_tid_node(fdata, tnode);
1047}
1048
1049static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
1050{
1051        struct hfi1_filedata *fdata = arg;
1052        struct tid_rb_node *tnode =
1053                container_of(node, struct tid_rb_node, mmu);
1054
1055        cacheless_tid_rb_remove(fdata, tnode);
1056}
1057