linux/drivers/infiniband/sw/rdmavt/qp.c
   1/*
   2 * Copyright(c) 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <linux/hash.h>
  49#include <linux/bitops.h>
  50#include <linux/lockdep.h>
  51#include <linux/vmalloc.h>
  52#include <linux/slab.h>
  53#include <rdma/ib_verbs.h>
  54#include "qp.h"
  55#include "vt.h"
  56#include "trace.h"
  57
  58/*
  59 * Note that it is OK to post send work requests in the SQE and ERR
  60 * states; rvt_do_send() will process them and generate error
  61 * completions as per IB 1.2 C10-96.
  62 */
  63const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
  64        [IB_QPS_RESET] = 0,
  65        [IB_QPS_INIT] = RVT_POST_RECV_OK,
  66        [IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
  67        [IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
  68            RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
  69            RVT_PROCESS_NEXT_SEND_OK,
  70        [IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
  71            RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
  72        [IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
  73            RVT_POST_SEND_OK | RVT_FLUSH_SEND,
  74        [IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
  75            RVT_POST_SEND_OK | RVT_FLUSH_SEND,
  76};
  77EXPORT_SYMBOL(ib_rvt_state_ops);
  78
  79static void get_map_page(struct rvt_qpn_table *qpt,
  80                         struct rvt_qpn_map *map,
  81                         gfp_t gfp)
  82{
  83        unsigned long page = get_zeroed_page(gfp);
  84
  85        /*
  86         * Free the page if someone raced with us installing it.
  87         */
  88
  89        spin_lock(&qpt->lock);
  90        if (map->page)
  91                free_page(page);
  92        else
  93                map->page = (void *)page;
  94        spin_unlock(&qpt->lock);
  95}
  96
  97/**
  98 * init_qpn_table - initialize the QP number table for a device
  99 * @qpt: the QPN table
 100 */
 101static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
 102{
 103        u32 offset, i;
 104        struct rvt_qpn_map *map;
 105        int ret = 0;
 106
 107        if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
 108                return -EINVAL;
 109
 110        spin_lock_init(&qpt->lock);
 111
 112        qpt->last = rdi->dparms.qpn_start;
 113        qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;
 114
 115        /*
  116         * Drivers may want some QPs beyond what we need for verbs; let them
  117         * use our qpn table. No need for two. Let's go ahead and mark the
  118         * bitmaps for those. The reserved range must be *after* the range
  119         * which verbs will pick from.
 120         */
 121
 122        /* Figure out number of bit maps needed before reserved range */
 123        qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;
 124
 125        /* This should always be zero */
 126        offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;
 127
 128        /* Starting with the first reserved bit map */
 129        map = &qpt->map[qpt->nmaps];
 130
 131        rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
 132                    rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
 133        for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
 134                if (!map->page) {
 135                        get_map_page(qpt, map, GFP_KERNEL);
 136                        if (!map->page) {
 137                                ret = -ENOMEM;
 138                                break;
 139                        }
 140                }
 141                set_bit(offset, map->page);
 142                offset++;
 143                if (offset == RVT_BITS_PER_PAGE) {
 144                        /* next page */
 145                        qpt->nmaps++;
 146                        map++;
 147                        offset = 0;
 148                }
 149        }
 150        return ret;
 151}
 152
 153/**
 154 * free_qpn_table - free the QP number table for a device
 155 * @qpt: the QPN table
 156 */
 157static void free_qpn_table(struct rvt_qpn_table *qpt)
 158{
 159        int i;
 160
 161        for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
 162                free_page((unsigned long)qpt->map[i].page);
 163}
 164
 165/**
 166 * rvt_driver_qp_init - Init driver qp resources
  167 * @rdi: rvt dev structure
 168 *
 169 * Return: 0 on success
 170 */
 171int rvt_driver_qp_init(struct rvt_dev_info *rdi)
 172{
 173        int i;
 174        int ret = -ENOMEM;
 175
 176        if (!rdi->dparms.qp_table_size)
 177                return -EINVAL;
 178
 179        /*
  180         * The driver must provide these QP callbacks regardless of whether
  181         * it does its own QP allocation.
 182         */
 183        if (!rdi->driver_f.free_all_qps ||
 184            !rdi->driver_f.qp_priv_alloc ||
 185            !rdi->driver_f.qp_priv_free ||
 186            !rdi->driver_f.notify_qp_reset)
 187                return -EINVAL;
 188
 189        /* allocate parent object */
 190        rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
 191                                   rdi->dparms.node);
 192        if (!rdi->qp_dev)
 193                return -ENOMEM;
 194
 195        /* allocate hash table */
 196        rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
 197        rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
 198        rdi->qp_dev->qp_table =
 199                kmalloc_node(rdi->qp_dev->qp_table_size *
 200                             sizeof(*rdi->qp_dev->qp_table),
 201                             GFP_KERNEL, rdi->dparms.node);
 202        if (!rdi->qp_dev->qp_table)
 203                goto no_qp_table;
 204
 205        for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
 206                RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);
 207
 208        spin_lock_init(&rdi->qp_dev->qpt_lock);
 209
 210        /* initialize qpn map */
 211        if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
 212                goto fail_table;
 213
 214        spin_lock_init(&rdi->n_qps_lock);
 215
 216        return 0;
 217
 218fail_table:
 219        kfree(rdi->qp_dev->qp_table);
 220        free_qpn_table(&rdi->qp_dev->qpn_table);
 221
 222no_qp_table:
 223        kfree(rdi->qp_dev);
 224
 225        return ret;
 226}
 227
 228/**
  229 * rvt_free_all_qps - check for QPs still in use
 230 * @qpt: the QP table to empty
 231 *
 232 * There should not be any QPs still in use.
 233 * Free memory for table.
 234 */
 235static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
 236{
 237        unsigned long flags;
 238        struct rvt_qp *qp;
 239        unsigned n, qp_inuse = 0;
 240        spinlock_t *ql; /* work around too long line below */
 241
 242        if (rdi->driver_f.free_all_qps)
 243                qp_inuse = rdi->driver_f.free_all_qps(rdi);
 244
 245        qp_inuse += rvt_mcast_tree_empty(rdi);
 246
 247        if (!rdi->qp_dev)
 248                return qp_inuse;
 249
 250        ql = &rdi->qp_dev->qpt_lock;
 251        spin_lock_irqsave(ql, flags);
 252        for (n = 0; n < rdi->qp_dev->qp_table_size; n++) {
 253                qp = rcu_dereference_protected(rdi->qp_dev->qp_table[n],
 254                                               lockdep_is_held(ql));
 255                RCU_INIT_POINTER(rdi->qp_dev->qp_table[n], NULL);
 256
 257                for (; qp; qp = rcu_dereference_protected(qp->next,
 258                                                          lockdep_is_held(ql)))
 259                        qp_inuse++;
 260        }
 261        spin_unlock_irqrestore(ql, flags);
 262        synchronize_rcu();
 263        return qp_inuse;
 264}
 265
 266/**
 267 * rvt_qp_exit - clean up qps on device exit
 268 * @rdi: rvt dev structure
 269 *
 270 * Check for qp leaks and free resources.
 271 */
 272void rvt_qp_exit(struct rvt_dev_info *rdi)
 273{
 274        u32 qps_inuse = rvt_free_all_qps(rdi);
 275
 276        if (qps_inuse)
 277                rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
 278                           qps_inuse);
 279        if (!rdi->qp_dev)
 280                return;
 281
 282        kfree(rdi->qp_dev->qp_table);
 283        free_qpn_table(&rdi->qp_dev->qpn_table);
 284        kfree(rdi->qp_dev);
 285}
 286
 287static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
 288                              struct rvt_qpn_map *map, unsigned off)
 289{
 290        return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
 291}
 292
 293/**
 294 * alloc_qpn - Allocate the next available qpn or zero/one for QP type
 295 *             IB_QPT_SMI/IB_QPT_GSI
  296 * @rdi: rvt device info structure
  297 * @qpt: queue pair number table pointer
  298 * @port_num: IB port number, 1 based, comes from core
 299 *
 300 * Return: The queue pair number
 301 */
 302static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
 303                     enum ib_qp_type type, u8 port_num, gfp_t gfp)
 304{
 305        u32 i, offset, max_scan, qpn;
 306        struct rvt_qpn_map *map;
 307        u32 ret;
 308
 309        if (rdi->driver_f.alloc_qpn)
 310                return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num, gfp);
 311
 312        if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
 313                unsigned n;
 314
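                     /*
                      * QP0 (SMI) and QP1 (GSI) are fixed per port; each is
                      * tracked by one flag bit: bits 0/1 for port 1, bits 2/3
                      * for port 2, and so on.
                      */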
 315                ret = type == IB_QPT_GSI;
 316                n = 1 << (ret + 2 * (port_num - 1));
 317                spin_lock(&qpt->lock);
 318                if (qpt->flags & n)
 319                        ret = -EINVAL;
 320                else
 321                        qpt->flags |= n;
 322                spin_unlock(&qpt->lock);
 323                goto bail;
 324        }
 325
 326        qpn = qpt->last + qpt->incr;
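             /* past the end of the table: wrap, inverting bit 0 of the last qpn */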
 327        if (qpn >= RVT_QPN_MAX)
 328                qpn = qpt->incr | ((qpt->last & 1) ^ 1);
 329        /* offset carries bit 0 */
 330        offset = qpn & RVT_BITS_PER_PAGE_MASK;
 331        map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
 332        max_scan = qpt->nmaps - !offset;
 333        for (i = 0;;) {
 334                if (unlikely(!map->page)) {
 335                        get_map_page(qpt, map, gfp);
 336                        if (unlikely(!map->page))
 337                                break;
 338                }
 339                do {
 340                        if (!test_and_set_bit(offset, map->page)) {
 341                                qpt->last = qpn;
 342                                ret = qpn;
 343                                goto bail;
 344                        }
 345                        offset += qpt->incr;
 346                        /*
  347                         * This qpn might be bogus if offset >= RVT_BITS_PER_PAGE.
  348                         * That is OK. It gets reassigned below.
 349                         */
 350                        qpn = mk_qpn(qpt, map, offset);
 351                } while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
 352                /*
 353                 * In order to keep the number of pages allocated to a
  354                 * minimum, we scan all the existing pages before increasing
 355                 * the size of the bitmap table.
 356                 */
 357                if (++i > max_scan) {
 358                        if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
 359                                break;
 360                        map = &qpt->map[qpt->nmaps++];
 361                        /* start at incr with current bit 0 */
 362                        offset = qpt->incr | (offset & 1);
 363                } else if (map < &qpt->map[qpt->nmaps]) {
 364                        ++map;
 365                        /* start at incr with current bit 0 */
 366                        offset = qpt->incr | (offset & 1);
 367                } else {
 368                        map = &qpt->map[0];
 369                        /* wrap to first map page, invert bit 0 */
 370                        offset = qpt->incr | ((offset & 1) ^ 1);
 371                }
 372                /* there can be no set bits in low-order QoS bits */
 373                WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1));
 374                qpn = mk_qpn(qpt, map, offset);
 375        }
 376
 377        ret = -ENOMEM;
 378
 379bail:
 380        return ret;
 381}
 382
 383static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
 384{
 385        struct rvt_qpn_map *map;
 386
 387        map = qpt->map + qpn / RVT_BITS_PER_PAGE;
 388        if (map->page)
 389                clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
 390}
 391
 392/**
  393 * rvt_clear_mr_refs - Drop held mr refs
  394 * @qp: rvt qp data structure
  395 * @clr_sends: whether to clear the send side as well
 396 */
 397static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
 398{
 399        unsigned n;
 400        struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
 401
 402        if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
 403                rvt_put_ss(&qp->s_rdma_read_sge);
 404
 405        rvt_put_ss(&qp->r_sge);
 406
 407        if (clr_sends) {
 408                while (qp->s_last != qp->s_head) {
 409                        struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 410                        unsigned i;
 411
 412                        for (i = 0; i < wqe->wr.num_sge; i++) {
 413                                struct rvt_sge *sge = &wqe->sg_list[i];
 414
 415                                rvt_put_mr(sge->mr);
 416                        }
 417                        if (qp->ibqp.qp_type == IB_QPT_UD ||
 418                            qp->ibqp.qp_type == IB_QPT_SMI ||
 419                            qp->ibqp.qp_type == IB_QPT_GSI)
 420                                atomic_dec(&ibah_to_rvtah(
 421                                                wqe->ud_wr.ah)->refcount);
 422                        if (++qp->s_last >= qp->s_size)
 423                                qp->s_last = 0;
 424                        smp_wmb(); /* see qp_set_savail */
 425                }
 426                if (qp->s_rdma_mr) {
 427                        rvt_put_mr(qp->s_rdma_mr);
 428                        qp->s_rdma_mr = NULL;
 429                }
 430        }
 431
 432        if (qp->ibqp.qp_type != IB_QPT_RC)
 433                return;
 434
 435        for (n = 0; n < rvt_max_atomic(rdi); n++) {
 436                struct rvt_ack_entry *e = &qp->s_ack_queue[n];
 437
 438                if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
 439                    e->rdma_sge.mr) {
 440                        rvt_put_mr(e->rdma_sge.mr);
 441                        e->rdma_sge.mr = NULL;
 442                }
 443        }
 444}
 445
 446/**
  447 * rvt_remove_qp - remove qp from table
 448 * @rdi: rvt dev struct
 449 * @qp: qp to remove
 450 *
 451 * Remove the QP from the table so it can't be found asynchronously by
 452 * the receive routine.
 453 */
 454static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 455{
 456        struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
 457        u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
 458        unsigned long flags;
 459        int removed = 1;
 460
 461        spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
 462
 463        if (rcu_dereference_protected(rvp->qp[0],
 464                        lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
 465                RCU_INIT_POINTER(rvp->qp[0], NULL);
 466        } else if (rcu_dereference_protected(rvp->qp[1],
 467                        lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
 468                RCU_INIT_POINTER(rvp->qp[1], NULL);
 469        } else {
 470                struct rvt_qp *q;
 471                struct rvt_qp __rcu **qpp;
 472
 473                removed = 0;
 474                qpp = &rdi->qp_dev->qp_table[n];
 475                for (; (q = rcu_dereference_protected(*qpp,
 476                        lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
 477                        qpp = &q->next) {
 478                        if (q == qp) {
 479                                RCU_INIT_POINTER(*qpp,
 480                                     rcu_dereference_protected(qp->next,
 481                                     lockdep_is_held(&rdi->qp_dev->qpt_lock)));
 482                                removed = 1;
 483                                trace_rvt_qpremove(qp, n);
 484                                break;
 485                        }
 486                }
 487        }
 488
 489        spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
 490        if (removed) {
 491                synchronize_rcu();
 492                if (atomic_dec_and_test(&qp->refcount))
 493                        wake_up(&qp->wait);
 494        }
 495}
 496
 497/**
  498 * rvt_reset_qp - initialize the QP state to the reset state
  499 * @qp: the QP to reset
  500 * @type: the QP type
  501 * r_lock, s_hlock, and s_lock are required to be held by the caller
 502 */
 503static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 504                  enum ib_qp_type type)
 505        __releases(&qp->s_lock)
 506        __releases(&qp->s_hlock)
 507        __releases(&qp->r_lock)
 508        __acquires(&qp->r_lock)
 509        __acquires(&qp->s_hlock)
 510        __acquires(&qp->s_lock)
 511{
 512        if (qp->state != IB_QPS_RESET) {
 513                qp->state = IB_QPS_RESET;
 514
 515                /* Let drivers flush their waitlist */
 516                rdi->driver_f.flush_qp_waiters(qp);
 517                qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
 518                spin_unlock(&qp->s_lock);
 519                spin_unlock(&qp->s_hlock);
 520                spin_unlock_irq(&qp->r_lock);
 521
 522                /* Stop the send queue and the retry timer */
 523                rdi->driver_f.stop_send_queue(qp);
 524
 525                /* Wait for things to stop */
 526                rdi->driver_f.quiesce_qp(qp);
 527
  528                /* take qp out of the hash table and wait for it to be unused */
 529                rvt_remove_qp(rdi, qp);
 530                wait_event(qp->wait, !atomic_read(&qp->refcount));
 531
  532                /* re-acquire the locks because they were held at call time */
 533                spin_lock_irq(&qp->r_lock);
 534                spin_lock(&qp->s_hlock);
 535                spin_lock(&qp->s_lock);
 536
 537                rvt_clear_mr_refs(qp, 1);
 538        }
 539
 540        /*
  541         * Let the driver do any teardown it needs to for a qp
 542         * that has been reset
 543         */
 544        rdi->driver_f.notify_qp_reset(qp);
 545
 546        qp->remote_qpn = 0;
 547        qp->qkey = 0;
 548        qp->qp_access_flags = 0;
 549        qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
 550        qp->s_hdrwords = 0;
 551        qp->s_wqe = NULL;
 552        qp->s_draining = 0;
 553        qp->s_next_psn = 0;
 554        qp->s_last_psn = 0;
 555        qp->s_sending_psn = 0;
 556        qp->s_sending_hpsn = 0;
 557        qp->s_psn = 0;
 558        qp->r_psn = 0;
 559        qp->r_msn = 0;
 560        if (type == IB_QPT_RC) {
 561                qp->s_state = IB_OPCODE_RC_SEND_LAST;
 562                qp->r_state = IB_OPCODE_RC_SEND_LAST;
 563        } else {
 564                qp->s_state = IB_OPCODE_UC_SEND_LAST;
 565                qp->r_state = IB_OPCODE_UC_SEND_LAST;
 566        }
 567        qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
 568        qp->r_nak_state = 0;
 569        qp->r_aflags = 0;
 570        qp->r_flags = 0;
 571        qp->s_head = 0;
 572        qp->s_tail = 0;
 573        qp->s_cur = 0;
 574        qp->s_acked = 0;
 575        qp->s_last = 0;
 576        qp->s_ssn = 1;
 577        qp->s_lsn = 0;
 578        qp->s_mig_state = IB_MIG_MIGRATED;
 579        qp->r_head_ack_queue = 0;
 580        qp->s_tail_ack_queue = 0;
 581        qp->s_num_rd_atomic = 0;
 582        if (qp->r_rq.wq) {
 583                qp->r_rq.wq->head = 0;
 584                qp->r_rq.wq->tail = 0;
 585        }
 586        qp->r_sge.num_sge = 0;
 587}
 588
 589/**
 590 * rvt_create_qp - create a queue pair for a device
  591 * @ibpd: the protection domain whose device we create the queue pair for
 592 * @init_attr: the attributes of the queue pair
 593 * @udata: user data for libibverbs.so
 594 *
 595 * Queue pair creation is mostly an rvt issue. However, drivers have their own
 596 * unique idea of what queue pair numbers mean. For instance there is a reserved
 597 * range for PSM.
 598 *
 599 * Return: the queue pair on success, otherwise returns an errno.
 600 *
 601 * Called by the ib_create_qp() core verbs function.
 602 */
 603struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
 604                            struct ib_qp_init_attr *init_attr,
 605                            struct ib_udata *udata)
 606{
 607        struct rvt_qp *qp;
 608        int err;
 609        struct rvt_swqe *swq = NULL;
 610        size_t sz;
 611        size_t sg_list_sz;
 612        struct ib_qp *ret = ERR_PTR(-ENOMEM);
 613        struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
 614        void *priv = NULL;
 615        gfp_t gfp;
 616
 617        if (!rdi)
 618                return ERR_PTR(-EINVAL);
 619
 620        if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge ||
 621            init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr ||
 622            init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO))
 623                return ERR_PTR(-EINVAL);
 624
  625        /* GFP_NOIO is applicable to RC QPs only */
 626
 627        if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO &&
 628            init_attr->qp_type != IB_QPT_RC)
 629                return ERR_PTR(-EINVAL);
 630
 631        gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ?
 632                                                GFP_NOIO : GFP_KERNEL;
 633
 634        /* Check receive queue parameters if no SRQ is specified. */
 635        if (!init_attr->srq) {
 636                if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge ||
 637                    init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
 638                        return ERR_PTR(-EINVAL);
 639
 640                if (init_attr->cap.max_send_sge +
 641                    init_attr->cap.max_send_wr +
 642                    init_attr->cap.max_recv_sge +
 643                    init_attr->cap.max_recv_wr == 0)
 644                        return ERR_PTR(-EINVAL);
 645        }
 646
 647        switch (init_attr->qp_type) {
 648        case IB_QPT_SMI:
 649        case IB_QPT_GSI:
 650                if (init_attr->port_num == 0 ||
 651                    init_attr->port_num > ibpd->device->phys_port_cnt)
 652                        return ERR_PTR(-EINVAL);
 653        case IB_QPT_UC:
 654        case IB_QPT_RC:
 655        case IB_QPT_UD:
 656                sz = sizeof(struct rvt_sge) *
 657                        init_attr->cap.max_send_sge +
 658                        sizeof(struct rvt_swqe);
 659                if (gfp == GFP_NOIO)
 660                        swq = __vmalloc(
 661                                (init_attr->cap.max_send_wr + 1) * sz,
 662                                gfp | __GFP_ZERO, PAGE_KERNEL);
 663                else
 664                        swq = vzalloc_node(
 665                                (init_attr->cap.max_send_wr + 1) * sz,
 666                                rdi->dparms.node);
 667                if (!swq)
 668                        return ERR_PTR(-ENOMEM);
 669
 670                sz = sizeof(*qp);
 671                sg_list_sz = 0;
 672                if (init_attr->srq) {
 673                        struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);
 674
 675                        if (srq->rq.max_sge > 1)
 676                                sg_list_sz = sizeof(*qp->r_sg_list) *
 677                                        (srq->rq.max_sge - 1);
 678                } else if (init_attr->cap.max_recv_sge > 1)
 679                        sg_list_sz = sizeof(*qp->r_sg_list) *
 680                                (init_attr->cap.max_recv_sge - 1);
 681                qp = kzalloc_node(sz + sg_list_sz, gfp, rdi->dparms.node);
 682                if (!qp)
 683                        goto bail_swq;
 684
 685                RCU_INIT_POINTER(qp->next, NULL);
 686                if (init_attr->qp_type == IB_QPT_RC) {
 687                        qp->s_ack_queue =
 688                                kzalloc_node(
 689                                        sizeof(*qp->s_ack_queue) *
 690                                         rvt_max_atomic(rdi),
 691                                        gfp,
 692                                        rdi->dparms.node);
 693                        if (!qp->s_ack_queue)
 694                                goto bail_qp;
 695                }
 696
 697                /*
  698                 * Driver needs to set up its private QP structure and do any
 699                 * initialization that is needed.
 700                 */
 701                priv = rdi->driver_f.qp_priv_alloc(rdi, qp, gfp);
 702                if (IS_ERR(priv)) {
 703                        ret = priv;
 704                        goto bail_qp;
 705                }
 706                qp->priv = priv;
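                     /* IB local ACK timeout is 4.096 usec * 2^timeout; convert to jiffies */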
 707                qp->timeout_jiffies =
 708                        usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
 709                                1000UL);
 710                if (init_attr->srq) {
 711                        sz = 0;
 712                } else {
 713                        qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
 714                        qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
 715                        sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
 716                                sizeof(struct rvt_rwqe);
 717                        if (udata)
 718                                qp->r_rq.wq = vmalloc_user(
 719                                                sizeof(struct rvt_rwq) +
 720                                                qp->r_rq.size * sz);
 721                        else if (gfp == GFP_NOIO)
 722                                qp->r_rq.wq = __vmalloc(
 723                                                sizeof(struct rvt_rwq) +
 724                                                qp->r_rq.size * sz,
 725                                                gfp | __GFP_ZERO, PAGE_KERNEL);
 726                        else
 727                                qp->r_rq.wq = vzalloc_node(
 728                                                sizeof(struct rvt_rwq) +
 729                                                qp->r_rq.size * sz,
 730                                                rdi->dparms.node);
 731                        if (!qp->r_rq.wq)
 732                                goto bail_driver_priv;
 733                }
 734
 735                /*
 736                 * ib_create_qp() will initialize qp->ibqp
 737                 * except for qp->ibqp.qp_num.
 738                 */
 739                spin_lock_init(&qp->r_lock);
 740                spin_lock_init(&qp->s_hlock);
 741                spin_lock_init(&qp->s_lock);
 742                spin_lock_init(&qp->r_rq.lock);
 743                atomic_set(&qp->refcount, 0);
 744                init_waitqueue_head(&qp->wait);
 745                init_timer(&qp->s_timer);
 746                qp->s_timer.data = (unsigned long)qp;
 747                INIT_LIST_HEAD(&qp->rspwait);
 748                qp->state = IB_QPS_RESET;
 749                qp->s_wq = swq;
 750                qp->s_size = init_attr->cap.max_send_wr + 1;
 751                qp->s_avail = init_attr->cap.max_send_wr;
 752                qp->s_max_sge = init_attr->cap.max_send_sge;
 753                if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
 754                        qp->s_flags = RVT_S_SIGNAL_REQ_WR;
 755
 756                err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
 757                                init_attr->qp_type,
 758                                init_attr->port_num, gfp);
 759                if (err < 0) {
 760                        ret = ERR_PTR(err);
 761                        goto bail_rq_wq;
 762                }
 763                qp->ibqp.qp_num = err;
 764                qp->port_num = init_attr->port_num;
 765                rvt_reset_qp(rdi, qp, init_attr->qp_type);
 766                break;
 767
 768        default:
 769                /* Don't support raw QPs */
 770                return ERR_PTR(-EINVAL);
 771        }
 772
 773        init_attr->cap.max_inline_data = 0;
 774
 775        /*
 776         * Return the address of the RWQ as the offset to mmap.
 777         * See rvt_mmap() for details.
 778         */
 779        if (udata && udata->outlen >= sizeof(__u64)) {
 780                if (!qp->r_rq.wq) {
 781                        __u64 offset = 0;
 782
 783                        err = ib_copy_to_udata(udata, &offset,
 784                                               sizeof(offset));
 785                        if (err) {
 786                                ret = ERR_PTR(err);
 787                                goto bail_qpn;
 788                        }
 789                } else {
 790                        u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
 791
 792                        qp->ip = rvt_create_mmap_info(rdi, s,
 793                                                      ibpd->uobject->context,
 794                                                      qp->r_rq.wq);
 795                        if (!qp->ip) {
 796                                ret = ERR_PTR(-ENOMEM);
 797                                goto bail_qpn;
 798                        }
 799
 800                        err = ib_copy_to_udata(udata, &qp->ip->offset,
 801                                               sizeof(qp->ip->offset));
 802                        if (err) {
 803                                ret = ERR_PTR(err);
 804                                goto bail_ip;
 805                        }
 806                }
 807                qp->pid = current->pid;
 808        }
 809
 810        spin_lock(&rdi->n_qps_lock);
 811        if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
 812                spin_unlock(&rdi->n_qps_lock);
 813                ret = ERR_PTR(-ENOMEM);
 814                goto bail_ip;
 815        }
 816
 817        rdi->n_qps_allocated++;
 818        /*
 819         * Maintain a busy_jiffies variable that will be added to the timeout
 820         * period in mod_retry_timer and add_retry_timer. This busy jiffies
 821         * is scaled by the number of rc qps created for the device to reduce
 822         * the number of timeouts occurring when there is a large number of
 823         * qps. busy_jiffies is incremented every rc qp scaling interval.
 824         * The scaling interval is selected based on extensive performance
 825         * evaluation of targeted workloads.
 826         */
 827        if (init_attr->qp_type == IB_QPT_RC) {
 828                rdi->n_rc_qps++;
 829                rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
 830        }
 831        spin_unlock(&rdi->n_qps_lock);
 832
 833        if (qp->ip) {
 834                spin_lock_irq(&rdi->pending_lock);
 835                list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
 836                spin_unlock_irq(&rdi->pending_lock);
 837        }
 838
 839        ret = &qp->ibqp;
 840
 841        /*
  842         * We have our QP and it's good; now keep track of what types of opcodes
 843         * can be processed on this QP. We do this by keeping track of what the
 844         * 3 high order bits of the opcode are.
 845         */
 846        switch (init_attr->qp_type) {
 847        case IB_QPT_SMI:
 848        case IB_QPT_GSI:
 849        case IB_QPT_UD:
 850                qp->allowed_ops = IB_OPCODE_UD;
 851                break;
 852        case IB_QPT_RC:
 853                qp->allowed_ops = IB_OPCODE_RC;
 854                break;
 855        case IB_QPT_UC:
 856                qp->allowed_ops = IB_OPCODE_UC;
 857                break;
 858        default:
 859                ret = ERR_PTR(-EINVAL);
 860                goto bail_ip;
 861        }
 862
 863        return ret;
 864
 865bail_ip:
 866        kref_put(&qp->ip->ref, rvt_release_mmap_info);
 867
 868bail_qpn:
 869        free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
 870
 871bail_rq_wq:
 872        vfree(qp->r_rq.wq);
 873
 874bail_driver_priv:
 875        rdi->driver_f.qp_priv_free(rdi, qp);
 876
 877bail_qp:
 878        kfree(qp->s_ack_queue);
 879        kfree(qp);
 880
 881bail_swq:
 882        vfree(swq);
 883
 884        return ret;
 885}
 886
 887/**
 888 * rvt_error_qp - put a QP into the error state
 889 * @qp: the QP to put into the error state
 890 * @err: the receive completion error to signal if a RWQE is active
 891 *
 892 * Flushes both send and receive work queues.
 893 *
 894 * Return: true if last WQE event should be generated.
 895 * The QP r_lock and s_lock should be held and interrupts disabled.
 896 * If we are already in error state, just return.
 897 */
 898int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 899{
 900        struct ib_wc wc;
 901        int ret = 0;
 902        struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
 903
 904        if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
 905                goto bail;
 906
 907        qp->state = IB_QPS_ERR;
 908
 909        if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
 910                qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
 911                del_timer(&qp->s_timer);
 912        }
 913
 914        if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
 915                qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
 916
 917        rdi->driver_f.notify_error_qp(qp);
 918
 919        /* Schedule the sending tasklet to drain the send work queue. */
 920        if (ACCESS_ONCE(qp->s_last) != qp->s_head)
 921                rdi->driver_f.schedule_send(qp);
 922
 923        rvt_clear_mr_refs(qp, 0);
 924
 925        memset(&wc, 0, sizeof(wc));
 926        wc.qp = &qp->ibqp;
 927        wc.opcode = IB_WC_RECV;
 928
 929        if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
 930                wc.wr_id = qp->r_wr_id;
 931                wc.status = err;
 932                rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
 933        }
 934        wc.status = IB_WC_WR_FLUSH_ERR;
 935
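             /* flush any receive work requests still queued on the receive queue */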
 936        if (qp->r_rq.wq) {
 937                struct rvt_rwq *wq;
 938                u32 head;
 939                u32 tail;
 940
 941                spin_lock(&qp->r_rq.lock);
 942
 943                /* sanity check pointers before trusting them */
 944                wq = qp->r_rq.wq;
 945                head = wq->head;
 946                if (head >= qp->r_rq.size)
 947                        head = 0;
 948                tail = wq->tail;
 949                if (tail >= qp->r_rq.size)
 950                        tail = 0;
 951                while (tail != head) {
 952                        wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
 953                        if (++tail >= qp->r_rq.size)
 954                                tail = 0;
 955                        rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
 956                }
 957                wq->tail = tail;
 958
 959                spin_unlock(&qp->r_rq.lock);
 960        } else if (qp->ibqp.event_handler) {
 961                ret = 1;
 962        }
 963
 964bail:
 965        return ret;
 966}
 967EXPORT_SYMBOL(rvt_error_qp);
 968
 969/*
 970 * Put the QP into the hash table.
 971 * The hash table holds a reference to the QP.
 972 */
 973static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 974{
 975        struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
 976        unsigned long flags;
 977
 978        atomic_inc(&qp->refcount);
 979        spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
 980
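             /* QP0 and QP1 hang directly off the port; all other QPs go in the hash table */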
 981        if (qp->ibqp.qp_num <= 1) {
 982                rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
 983        } else {
 984                u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
 985
 986                qp->next = rdi->qp_dev->qp_table[n];
 987                rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
 988                trace_rvt_qpinsert(qp, n);
 989        }
 990
 991        spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
 992}
 993
 994/**
  995 * rvt_modify_qp - modify the attributes of a queue pair
  996 * @ibqp: the queue pair whose attributes we're modifying
 997 * @attr: the new attributes
 998 * @attr_mask: the mask of attributes to modify
 999 * @udata: user data for libibverbs.so
1000 *
1001 * Return: 0 on success, otherwise returns an errno.
1002 */
1003int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1004                  int attr_mask, struct ib_udata *udata)
1005{
1006        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1007        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1008        enum ib_qp_state cur_state, new_state;
1009        struct ib_event ev;
1010        int lastwqe = 0;
1011        int mig = 0;
1012        int pmtu = 0; /* for gcc warning only */
1013        enum rdma_link_layer link;
1014
1015        link = rdma_port_get_link_layer(ibqp->device, qp->port_num);
1016
1017        spin_lock_irq(&qp->r_lock);
1018        spin_lock(&qp->s_hlock);
1019        spin_lock(&qp->s_lock);
1020
1021        cur_state = attr_mask & IB_QP_CUR_STATE ?
1022                attr->cur_qp_state : qp->state;
1023        new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
1024
1025        if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
1026                                attr_mask, link))
1027                goto inval;
1028
1029        if (rdi->driver_f.check_modify_qp &&
1030            rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
1031                goto inval;
1032
1033        if (attr_mask & IB_QP_AV) {
1034                if (attr->ah_attr.dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
1035                        goto inval;
1036                if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
1037                        goto inval;
1038        }
1039
1040        if (attr_mask & IB_QP_ALT_PATH) {
1041                if (attr->alt_ah_attr.dlid >=
1042                    be16_to_cpu(IB_MULTICAST_LID_BASE))
1043                        goto inval;
1044                if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
1045                        goto inval;
1046                if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
1047                        goto inval;
1048        }
1049
1050        if (attr_mask & IB_QP_PKEY_INDEX)
1051                if (attr->pkey_index >= rvt_get_npkeys(rdi))
1052                        goto inval;
1053
1054        if (attr_mask & IB_QP_MIN_RNR_TIMER)
1055                if (attr->min_rnr_timer > 31)
1056                        goto inval;
1057
1058        if (attr_mask & IB_QP_PORT)
1059                if (qp->ibqp.qp_type == IB_QPT_SMI ||
1060                    qp->ibqp.qp_type == IB_QPT_GSI ||
1061                    attr->port_num == 0 ||
1062                    attr->port_num > ibqp->device->phys_port_cnt)
1063                        goto inval;
1064
1065        if (attr_mask & IB_QP_DEST_QPN)
1066                if (attr->dest_qp_num > RVT_QPN_MASK)
1067                        goto inval;
1068
1069        if (attr_mask & IB_QP_RETRY_CNT)
1070                if (attr->retry_cnt > 7)
1071                        goto inval;
1072
1073        if (attr_mask & IB_QP_RNR_RETRY)
1074                if (attr->rnr_retry > 7)
1075                        goto inval;
1076
1077        /*
 1078         * Don't allow invalid path_mtu values. It is OK to set it greater
 1079         * than the active mtu (or even the max_cap, if we have tuned
 1080         * that to a small mtu). We'll set qp->path_mtu
 1081         * to the lesser of the requested attribute mtu and the active mtu,
 1082         * for packetizing messages.
1083         * Note that the QP port has to be set in INIT and MTU in RTR.
1084         */
1085        if (attr_mask & IB_QP_PATH_MTU) {
1086                pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
1087                if (pmtu < 0)
1088                        goto inval;
1089        }
1090
1091        if (attr_mask & IB_QP_PATH_MIG_STATE) {
1092                if (attr->path_mig_state == IB_MIG_REARM) {
1093                        if (qp->s_mig_state == IB_MIG_ARMED)
1094                                goto inval;
1095                        if (new_state != IB_QPS_RTS)
1096                                goto inval;
1097                } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
1098                        if (qp->s_mig_state == IB_MIG_REARM)
1099                                goto inval;
1100                        if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
1101                                goto inval;
1102                        if (qp->s_mig_state == IB_MIG_ARMED)
1103                                mig = 1;
1104                } else {
1105                        goto inval;
1106                }
1107        }
1108
1109        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1110                if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
1111                        goto inval;
1112
1113        switch (new_state) {
1114        case IB_QPS_RESET:
1115                if (qp->state != IB_QPS_RESET)
1116                        rvt_reset_qp(rdi, qp, ibqp->qp_type);
1117                break;
1118
1119        case IB_QPS_RTR:
1120                /* Allow event to re-trigger if QP set to RTR more than once */
1121                qp->r_flags &= ~RVT_R_COMM_EST;
1122                qp->state = new_state;
1123                break;
1124
1125        case IB_QPS_SQD:
1126                qp->s_draining = qp->s_last != qp->s_cur;
1127                qp->state = new_state;
1128                break;
1129
1130        case IB_QPS_SQE:
1131                if (qp->ibqp.qp_type == IB_QPT_RC)
1132                        goto inval;
1133                qp->state = new_state;
1134                break;
1135
1136        case IB_QPS_ERR:
1137                lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1138                break;
1139
1140        default:
1141                qp->state = new_state;
1142                break;
1143        }
1144
1145        if (attr_mask & IB_QP_PKEY_INDEX)
1146                qp->s_pkey_index = attr->pkey_index;
1147
1148        if (attr_mask & IB_QP_PORT)
1149                qp->port_num = attr->port_num;
1150
1151        if (attr_mask & IB_QP_DEST_QPN)
1152                qp->remote_qpn = attr->dest_qp_num;
1153
1154        if (attr_mask & IB_QP_SQ_PSN) {
1155                qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
1156                qp->s_psn = qp->s_next_psn;
1157                qp->s_sending_psn = qp->s_next_psn;
1158                qp->s_last_psn = qp->s_next_psn - 1;
1159                qp->s_sending_hpsn = qp->s_last_psn;
1160        }
1161
1162        if (attr_mask & IB_QP_RQ_PSN)
1163                qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
1164
1165        if (attr_mask & IB_QP_ACCESS_FLAGS)
1166                qp->qp_access_flags = attr->qp_access_flags;
1167
1168        if (attr_mask & IB_QP_AV) {
1169                qp->remote_ah_attr = attr->ah_attr;
1170                qp->s_srate = attr->ah_attr.static_rate;
1171                qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
1172        }
1173
1174        if (attr_mask & IB_QP_ALT_PATH) {
1175                qp->alt_ah_attr = attr->alt_ah_attr;
1176                qp->s_alt_pkey_index = attr->alt_pkey_index;
1177        }
1178
1179        if (attr_mask & IB_QP_PATH_MIG_STATE) {
1180                qp->s_mig_state = attr->path_mig_state;
1181                if (mig) {
1182                        qp->remote_ah_attr = qp->alt_ah_attr;
1183                        qp->port_num = qp->alt_ah_attr.port_num;
1184                        qp->s_pkey_index = qp->s_alt_pkey_index;
1185                }
1186        }
1187
1188        if (attr_mask & IB_QP_PATH_MTU) {
1189                qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
1190                qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
1191                qp->log_pmtu = ilog2(qp->pmtu);
1192        }
1193
1194        if (attr_mask & IB_QP_RETRY_CNT) {
1195                qp->s_retry_cnt = attr->retry_cnt;
1196                qp->s_retry = attr->retry_cnt;
1197        }
1198
1199        if (attr_mask & IB_QP_RNR_RETRY) {
1200                qp->s_rnr_retry_cnt = attr->rnr_retry;
1201                qp->s_rnr_retry = attr->rnr_retry;
1202        }
1203
1204        if (attr_mask & IB_QP_MIN_RNR_TIMER)
1205                qp->r_min_rnr_timer = attr->min_rnr_timer;
1206
1207        if (attr_mask & IB_QP_TIMEOUT) {
1208                qp->timeout = attr->timeout;
1209                qp->timeout_jiffies =
1210                        usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
1211                                1000UL);
1212        }
1213
1214        if (attr_mask & IB_QP_QKEY)
1215                qp->qkey = attr->qkey;
1216
1217        if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1218                qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
1219
1220        if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
1221                qp->s_max_rd_atomic = attr->max_rd_atomic;
1222
1223        if (rdi->driver_f.modify_qp)
1224                rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
1225
1226        spin_unlock(&qp->s_lock);
1227        spin_unlock(&qp->s_hlock);
1228        spin_unlock_irq(&qp->r_lock);
1229
1230        if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1231                rvt_insert_qp(rdi, qp);
1232
1233        if (lastwqe) {
1234                ev.device = qp->ibqp.device;
1235                ev.element.qp = &qp->ibqp;
1236                ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1237                qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1238        }
1239        if (mig) {
1240                ev.device = qp->ibqp.device;
1241                ev.element.qp = &qp->ibqp;
1242                ev.event = IB_EVENT_PATH_MIG;
1243                qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1244        }
1245        return 0;
1246
1247inval:
1248        spin_unlock(&qp->s_lock);
1249        spin_unlock(&qp->s_hlock);
1250        spin_unlock_irq(&qp->r_lock);
1251        return -EINVAL;
1252}
1253
1254/** rvt_free_qpn - Free a qpn from the bit map
1255 * @qpt: QP table
1256 * @qpn: queue pair number to free
1257 */
1258static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
1259{
1260        struct rvt_qpn_map *map;
1261
1262        map = qpt->map + qpn / RVT_BITS_PER_PAGE;
1263        if (map->page)
1264                clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
1265}
1266
1267/**
1268 * rvt_destroy_qp - destroy a queue pair
1269 * @ibqp: the queue pair to destroy
1270 *
1271 * Note that this can be called while the QP is actively sending or
1272 * receiving!
1273 *
1274 * Return: 0 on success.
1275 */
1276int rvt_destroy_qp(struct ib_qp *ibqp)
1277{
1278        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1279        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1280
1281        spin_lock_irq(&qp->r_lock);
1282        spin_lock(&qp->s_hlock);
1283        spin_lock(&qp->s_lock);
1284        rvt_reset_qp(rdi, qp, ibqp->qp_type);
1285        spin_unlock(&qp->s_lock);
1286        spin_unlock(&qp->s_hlock);
1287        spin_unlock_irq(&qp->r_lock);
1288
1289        /* qpn is now available for use again */
1290        rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
1291
1292        spin_lock(&rdi->n_qps_lock);
1293        rdi->n_qps_allocated--;
1294        if (qp->ibqp.qp_type == IB_QPT_RC) {
1295                rdi->n_rc_qps--;
1296                rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
1297        }
1298        spin_unlock(&rdi->n_qps_lock);
1299
1300        if (qp->ip)
1301                kref_put(&qp->ip->ref, rvt_release_mmap_info);
1302        else
1303                vfree(qp->r_rq.wq);
1304        vfree(qp->s_wq);
1305        rdi->driver_f.qp_priv_free(rdi, qp);
1306        kfree(qp->s_ack_queue);
1307        kfree(qp);
1308        return 0;
1309}
1310
1311/**
 1312 * rvt_query_qp - query an ibqp
1313 * @ibqp: IB qp to query
1314 * @attr: attr struct to fill in
1315 * @attr_mask: attr mask ignored
1316 * @init_attr: struct to fill in
1317 *
1318 * Return: always 0
1319 */
1320int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1321                 int attr_mask, struct ib_qp_init_attr *init_attr)
1322{
1323        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1324        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1325
1326        attr->qp_state = qp->state;
1327        attr->cur_qp_state = attr->qp_state;
1328        attr->path_mtu = qp->path_mtu;
1329        attr->path_mig_state = qp->s_mig_state;
1330        attr->qkey = qp->qkey;
1331        attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
1332        attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
1333        attr->dest_qp_num = qp->remote_qpn;
1334        attr->qp_access_flags = qp->qp_access_flags;
1335        attr->cap.max_send_wr = qp->s_size - 1;
1336        attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
1337        attr->cap.max_send_sge = qp->s_max_sge;
1338        attr->cap.max_recv_sge = qp->r_rq.max_sge;
1339        attr->cap.max_inline_data = 0;
1340        attr->ah_attr = qp->remote_ah_attr;
1341        attr->alt_ah_attr = qp->alt_ah_attr;
1342        attr->pkey_index = qp->s_pkey_index;
1343        attr->alt_pkey_index = qp->s_alt_pkey_index;
1344        attr->en_sqd_async_notify = 0;
1345        attr->sq_draining = qp->s_draining;
1346        attr->max_rd_atomic = qp->s_max_rd_atomic;
1347        attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
1348        attr->min_rnr_timer = qp->r_min_rnr_timer;
1349        attr->port_num = qp->port_num;
1350        attr->timeout = qp->timeout;
1351        attr->retry_cnt = qp->s_retry_cnt;
1352        attr->rnr_retry = qp->s_rnr_retry_cnt;
1353        attr->alt_port_num = qp->alt_ah_attr.port_num;
1354        attr->alt_timeout = qp->alt_timeout;
1355
1356        init_attr->event_handler = qp->ibqp.event_handler;
1357        init_attr->qp_context = qp->ibqp.qp_context;
1358        init_attr->send_cq = qp->ibqp.send_cq;
1359        init_attr->recv_cq = qp->ibqp.recv_cq;
1360        init_attr->srq = qp->ibqp.srq;
1361        init_attr->cap = attr->cap;
1362        if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
1363                init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
1364        else
1365                init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
1366        init_attr->qp_type = qp->ibqp.qp_type;
1367        init_attr->port_num = qp->port_num;
1368        return 0;
1369}
1370
1371/**
 1372 * rvt_post_recv - post a receive on a QP
1373 * @ibqp: the QP to post the receive on
1374 * @wr: the WR to post
1375 * @bad_wr: the first bad WR is put here
1376 *
1377 * This may be called from interrupt context.
1378 *
1379 * Return: 0 on success otherwise errno
1380 */
1381int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
1382                  struct ib_recv_wr **bad_wr)
1383{
1384        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1385        struct rvt_rwq *wq = qp->r_rq.wq;
1386        unsigned long flags;
1387        int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
1388                                !qp->ibqp.srq;
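             /* when flushing (error state, no SRQ), complete new receives immediately as flushed */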
1389
1390        /* Check that state is OK to post receive. */
1391        if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
1392                *bad_wr = wr;
1393                return -EINVAL;
1394        }
1395
1396        for (; wr; wr = wr->next) {
1397                struct rvt_rwqe *wqe;
1398                u32 next;
1399                int i;
1400
1401                if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
1402                        *bad_wr = wr;
1403                        return -EINVAL;
1404                }
1405
1406                spin_lock_irqsave(&qp->r_rq.lock, flags);
1407                next = wq->head + 1;
1408                if (next >= qp->r_rq.size)
1409                        next = 0;
1410                if (next == wq->tail) {
1411                        spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1412                        *bad_wr = wr;
1413                        return -ENOMEM;
1414                }
1415                if (unlikely(qp_err_flush)) {
1416                        struct ib_wc wc;
1417
1418                        memset(&wc, 0, sizeof(wc));
1419                        wc.qp = &qp->ibqp;
1420                        wc.opcode = IB_WC_RECV;
1421                        wc.wr_id = wr->wr_id;
1422                        wc.status = IB_WC_WR_FLUSH_ERR;
1423                        rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1424                } else {
1425                        wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
1426                        wqe->wr_id = wr->wr_id;
1427                        wqe->num_sge = wr->num_sge;
1428                        for (i = 0; i < wr->num_sge; i++)
1429                                wqe->sg_list[i] = wr->sg_list[i];
1430                        /*
1431                         * Make sure queue entry is written
1432                         * before the head index.
1433                         */
1434                        smp_wmb();
1435                        wq->head = next;
1436                }
1437                spin_unlock_irqrestore(&qp->r_rq.lock, flags);
1438        }
1439        return 0;
1440}
1441
1442/**
1443 * qp_get_savail - return number of avail send entries
1444 *
 1445 * @qp: the qp
1446 *
1447 * This assumes the s_hlock is held but the s_last
1448 * qp variable is uncontrolled.
1449 */
1450static inline u32 qp_get_savail(struct rvt_qp *qp)
1451{
1452        u32 slast;
1453        u32 ret;
1454
1455        smp_read_barrier_depends(); /* see rc.c */
1456        slast = ACCESS_ONCE(qp->s_last);
1457        if (qp->s_head >= slast)
1458                ret = qp->s_size - (qp->s_head - slast);
1459        else
1460                ret = slast - qp->s_head;
1461        return ret - 1;
1462}
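
/*
 * Worked example (illustrative): with s_size = 8, s_head = 5 and s_last = 2,
 * three entries are in use, so qp_get_savail() returns 8 - (5 - 2) - 1 = 4.
 * One slot is always kept unused so that a completely full ring is
 * distinguishable from an empty one (head == last).
 */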
1463
1464/**
1465 * rvt_post_one_wr - post one RC, UC, or UD send work request
1466 * @qp: the QP to post on
1467 * @wr: the work request to send
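 * @call_send: updated by the driver's check_send_wqe() callback to
 *             indicate whether the send engine should be called directly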
1468 */
1469static int rvt_post_one_wr(struct rvt_qp *qp,
1470                           struct ib_send_wr *wr,
1471                           int *call_send)
1472{
1473        struct rvt_swqe *wqe;
1474        u32 next;
1475        int i;
1476        int j;
1477        int acc;
1478        struct rvt_lkey_table *rkt;
1479        struct rvt_pd *pd;
1480        struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
1481        u8 log_pmtu;
1482        int ret;
1483
1484        /* IB spec says that num_sge == 0 is OK. */
1485        if (unlikely(wr->num_sge > qp->s_max_sge))
1486                return -EINVAL;
1487
1488        /*
1489         * Don't allow RDMA reads or atomic operations on UC QPs, and
1490         * reject opcodes that are undefined for the QP type.
1491         * Make sure the buffer is large enough to hold the result of atomics.
1492         */
1493        if (qp->ibqp.qp_type == IB_QPT_UC) {
1494                if ((unsigned)wr->opcode >= IB_WR_RDMA_READ)
1495                        return -EINVAL;
1496        } else if (qp->ibqp.qp_type != IB_QPT_RC) {
1497                /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
1498                if (wr->opcode != IB_WR_SEND &&
1499                    wr->opcode != IB_WR_SEND_WITH_IMM)
1500                        return -EINVAL;
1501                /* Check UD destination address PD */
1502                if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
1503                        return -EINVAL;
1504        } else if ((unsigned)wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
1505                return -EINVAL;
1506        } else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
1507                   (wr->num_sge == 0 ||
1508                    wr->sg_list[0].length < sizeof(u64) ||
1509                    wr->sg_list[0].addr & (sizeof(u64) - 1))) {
1510                return -EINVAL;
1511        } else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
1512                return -EINVAL;
1513        }
1514        /* check for available send queue entries */
1515        if (unlikely(!qp->s_avail)) {
1516                qp->s_avail = qp_get_savail(qp);
1517                if (WARN_ON(qp->s_avail > (qp->s_size - 1)))
1518                        rvt_pr_err(rdi,
1519                                   "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
1520                                   qp->ibqp.qp_num, qp->s_size, qp->s_avail,
1521                                   qp->s_head, qp->s_tail, qp->s_cur,
1522                                   qp->s_acked, qp->s_last);
1523                if (!qp->s_avail)
1524                        return -ENOMEM;
1525        }
1526        next = qp->s_head + 1;
1527        if (next >= qp->s_size)
1528                next = 0;
1529
1530        rkt = &rdi->lkey_table;
1531        pd = ibpd_to_rvtpd(qp->ibqp.pd);
1532        wqe = rvt_get_swqe_ptr(qp, qp->s_head);
1533
1534        if (qp->ibqp.qp_type != IB_QPT_UC &&
1535            qp->ibqp.qp_type != IB_QPT_RC)
1536                memcpy(&wqe->ud_wr, ud_wr(wr), sizeof(wqe->ud_wr));
1537        else if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM ||
1538                 wr->opcode == IB_WR_RDMA_WRITE ||
1539                 wr->opcode == IB_WR_RDMA_READ)
1540                memcpy(&wqe->rdma_wr, rdma_wr(wr), sizeof(wqe->rdma_wr));
1541        else if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1542                 wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1543                memcpy(&wqe->atomic_wr, atomic_wr(wr), sizeof(wqe->atomic_wr));
1544        else
1545                memcpy(&wqe->wr, wr, sizeof(wqe->wr));
1546
1547        wqe->length = 0;
1548        j = 0;
1549        if (wr->num_sge) {
1550                acc = wr->opcode >= IB_WR_RDMA_READ ?
1551                        IB_ACCESS_LOCAL_WRITE : 0;
1552                for (i = 0; i < wr->num_sge; i++) {
1553                        u32 length = wr->sg_list[i].length;
1554                        int ok;
1555
1556                        if (length == 0)
1557                                continue;
1558                        ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j],
1559                                         &wr->sg_list[i], acc);
1560                        if (!ok) {
1561                                ret = -EINVAL;
1562                                goto bail_inval_free;
1563                        }
1564                        wqe->length += length;
1565                        j++;
1566                }
1567                wqe->wr.num_sge = j;
1568        }
1569
1570        /* The general part of the WQE is now valid; let the driver check it. */
1571        if (rdi->driver_f.check_send_wqe) {
1572                ret = rdi->driver_f.check_send_wqe(qp, wqe);
1573                if (ret < 0)
1574                        goto bail_inval_free;
1575                if (ret)
1576                        *call_send = ret;
1577        }
1578
1579        log_pmtu = qp->log_pmtu;
1580        if (qp->ibqp.qp_type != IB_QPT_UC &&
1581            qp->ibqp.qp_type != IB_QPT_RC) {
1582                struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
1583
1584                log_pmtu = ah->log_pmtu;
1585                atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
1586        }
1587
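        /*
         * Assign the send sequence number and the PSN range covered by
         * this WQE.  The last PSN follows from the payload length and the
         * path MTU: illustratively, an 8192-byte request with a 2048-byte
         * MTU (log_pmtu = 11) spans PSNs psn .. psn + 3.
         */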
1588        wqe->ssn = qp->s_ssn++;
1589        wqe->psn = qp->s_next_psn;
1590        wqe->lpsn = wqe->psn +
1591                        (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0);
1592        qp->s_next_psn = wqe->lpsn + 1;
1593        trace_rvt_post_one_wr(qp, wqe);
1594        smp_wmb(); /* see request builders */
1595        qp->s_avail--;
1596        qp->s_head = next;
1597
1598        return 0;
1599
1600bail_inval_free:
1601        /* release the MR references taken in rvt_lkey_ok() above */
1602        while (j) {
1603                struct rvt_sge *sge = &wqe->sg_list[--j];
1604
1605                rvt_put_mr(sge->mr);
1606        }
1607        return ret;
1608}
1609
1610/**
1611 * rvt_post_send - post a send on a QP
1612 * @ibqp: the QP to post the send on
1613 * @wr: the list of work requests to post
1614 * @bad_wr: the first bad WR is put here
1615 *
1616 * This may be called from interrupt context.
1617 *
1618 * Return: 0 on success else errno
1619 */
1620int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1621                  struct ib_send_wr **bad_wr)
1622{
1623        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1624        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1625        unsigned long flags = 0;
1626        int call_send;
1627        unsigned nreq = 0;
1628        int err = 0;
1629
1630        spin_lock_irqsave(&qp->s_hlock, flags);
1631
1632        /*
1633         * Ensure QP state is such that we can send. If not bail out early,
1634         * there is no need to do this every time we post a send.
1635         */
1636        if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
1637                spin_unlock_irqrestore(&qp->s_hlock, flags);
1638                return -EINVAL;
1639        }
1640
1641        /*
1642         * If the send queue is empty and there is only a single WR, just
1643         * kick the send engine into gear directly. Otherwise, always
1644         * schedule the send to happen later.
1645         */
1646        call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next;
1647
1648        for (; wr; wr = wr->next) {
1649                err = rvt_post_one_wr(qp, wr, &call_send);
1650                if (unlikely(err)) {
1651                        *bad_wr = wr;
1652                        goto bail;
1653                }
1654                nreq++;
1655        }
1656bail:
1657        spin_unlock_irqrestore(&qp->s_hlock, flags);
1658        if (nreq) {
1659                if (call_send)
1660                        rdi->driver_f.do_send(qp);
1661                else
1662                        rdi->driver_f.schedule_send_no_lock(qp);
1663        }
1664        return err;
1665}
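
/*
 * Illustrative only, not part of the driver: a minimal sketch of a ULP
 * posting one signaled SEND that is handled by rvt_post_send() through the
 * ib_post_send() verb.  The dma_addr, msg_len, pd_local_lkey and ctx names
 * are hypothetical.
 *
 *	struct ib_sge sge = {
 *		.addr   = dma_addr,
 *		.length = msg_len,
 *		.lkey   = pd_local_lkey,
 *	};
 *	struct ib_send_wr wr = {
 *		.wr_id      = (uintptr_t)ctx,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IB_WR_SEND,
 *		.send_flags = IB_SEND_SIGNALED,
 *	};
 *	struct ib_send_wr *bad_wr;
 *	int ret = ib_post_send(qp, &wr, &bad_wr);
 *
 * Because the chain holds a single WR, rvt_post_send() may call the
 * driver's do_send() callback directly when the send queue is empty;
 * otherwise it schedules the send to run later.
 */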
1666
1667/**
1668 * rvt_post_srq_receive - post a receive on a shared receive queue
1669 * @ibsrq: the SRQ to post the receive on
1670 * @wr: the list of work requests to post
1671 * @bad_wr: the first bad WR is put here
1672 *
1673 * This may be called from interrupt context.
1674 *
1675 * Return: 0 on success else errno
1676 */
1677int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
1678                      struct ib_recv_wr **bad_wr)
1679{
1680        struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
1681        struct rvt_rwq *wq;
1682        unsigned long flags;
1683
1684        for (; wr; wr = wr->next) {
1685                struct rvt_rwqe *wqe;
1686                u32 next;
1687                int i;
1688
1689                if ((unsigned)wr->num_sge > srq->rq.max_sge) {
1690                        *bad_wr = wr;
1691                        return -EINVAL;
1692                }
1693
1694                spin_lock_irqsave(&srq->rq.lock, flags);
1695                wq = srq->rq.wq;
1696                next = wq->head + 1;
1697                if (next >= srq->rq.size)
1698                        next = 0;
1699                if (next == wq->tail) {
1700                        spin_unlock_irqrestore(&srq->rq.lock, flags);
1701                        *bad_wr = wr;
1702                        return -ENOMEM;
1703                }
1704
1705                wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
1706                wqe->wr_id = wr->wr_id;
1707                wqe->num_sge = wr->num_sge;
1708                for (i = 0; i < wr->num_sge; i++)
1709                        wqe->sg_list[i] = wr->sg_list[i];
1710                /* Make sure queue entry is written before the head index. */
1711                smp_wmb();
1712                wq->head = next;
1713                spin_unlock_irqrestore(&srq->rq.lock, flags);
1714        }
1715        return 0;
1716}
1717