linux/drivers/infiniband/hw/hfi1/verbs.c
   1/*
   2 * Copyright(c) 2015 - 2018 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <rdma/ib_mad.h>
  49#include <rdma/ib_user_verbs.h>
  50#include <linux/io.h>
  51#include <linux/module.h>
  52#include <linux/utsname.h>
  53#include <linux/rculist.h>
  54#include <linux/mm.h>
  55#include <linux/vmalloc.h>
  56#include <rdma/opa_addr.h>
  57#include <linux/nospec.h>
  58
  59#include "hfi.h"
  60#include "common.h"
  61#include "device.h"
  62#include "trace.h"
  63#include "qp.h"
  64#include "verbs_txreq.h"
  65#include "debugfs.h"
  66#include "vnic.h"
  67#include "fault.h"
  68#include "affinity.h"
  69
  70static unsigned int hfi1_lkey_table_size = 16;
  71module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
  72                   S_IRUGO);
  73MODULE_PARM_DESC(lkey_table_size,
  74                 "LKEY table size in bits (2^n, 1 <= n <= 23)");
  75
  76static unsigned int hfi1_max_pds = 0xFFFF;
  77module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
  78MODULE_PARM_DESC(max_pds,
  79                 "Maximum number of protection domains to support");
  80
  81static unsigned int hfi1_max_ahs = 0xFFFF;
  82module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
  83MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
  84
  85unsigned int hfi1_max_cqes = 0x2FFFFF;
  86module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
  87MODULE_PARM_DESC(max_cqes,
  88                 "Maximum number of completion queue entries to support");
  89
  90unsigned int hfi1_max_cqs = 0x1FFFF;
  91module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
  92MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
  93
  94unsigned int hfi1_max_qp_wrs = 0x3FFF;
  95module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
  96MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
  97
  98unsigned int hfi1_max_qps = 32768;
  99module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
 100MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
 101
 102unsigned int hfi1_max_sges = 0x60;
 103module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
 104MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
 105
 106unsigned int hfi1_max_mcast_grps = 16384;
 107module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
 108MODULE_PARM_DESC(max_mcast_grps,
 109                 "Maximum number of multicast groups to support");
 110
 111unsigned int hfi1_max_mcast_qp_attached = 16;
 112module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
 113                   uint, S_IRUGO);
 114MODULE_PARM_DESC(max_mcast_qp_attached,
 115                 "Maximum number of attached QPs to support");
 116
 117unsigned int hfi1_max_srqs = 1024;
 118module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
 119MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
 120
 121unsigned int hfi1_max_srq_sges = 128;
 122module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
 123MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
 124
 125unsigned int hfi1_max_srq_wrs = 0x1FFFF;
 126module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
 128
 129unsigned short piothreshold = 256;
 130module_param(piothreshold, ushort, S_IRUGO);
 131MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 132
 133static unsigned int sge_copy_mode;
 134module_param(sge_copy_mode, uint, S_IRUGO);
 135MODULE_PARM_DESC(sge_copy_mode,
 136                 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
 137
 138static void verbs_sdma_complete(
 139        struct sdma_txreq *cookie,
 140        int status);
 141
 142static int pio_wait(struct rvt_qp *qp,
 143                    struct send_context *sc,
 144                    struct hfi1_pkt_state *ps,
 145                    u32 flag);
 146
 147/* Length of buffer to create verbs txreq cache name */
 148#define TXREQ_NAME_LEN 24
 149
/* 16B trailing buffer: zero-filled source of the pad, ICRC and LT bytes */
 151static const u8 trail_buf[MAX_16B_PADDING];
 152
 153static uint wss_threshold = 80;
 154module_param(wss_threshold, uint, S_IRUGO);
 155MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
 156static uint wss_clean_period = 256;
 157module_param(wss_clean_period, uint, S_IRUGO);
 158MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
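
/*
 * All of the module parameters above are read-only at runtime (S_IRUGO),
 * so they are normally given at module load time, e.g. (an illustrative
 * invocation, not taken from this file):
 *
 *   modprobe hfi1 lkey_table_size=17 max_qps=16384 piothreshold=128
 *
 * Any clamping of out-of-range values is expected to happen in driver or
 * rdmavt initialization code outside this file.
 */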
 159
 160/*
 161 * Translate ib_wr_opcode into ib_wc_opcode.
 162 */
 163const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
 164        [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 165        [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 166        [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
 167        [IB_WR_SEND] = IB_WC_SEND,
 168        [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
 169        [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
 170        [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
 171        [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
 172        [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
 173        [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
 174        [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
 175        [IB_WR_REG_MR] = IB_WC_REG_MR
 176};
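
/*
 * A sketch of how a table like this is consulted when a send work request
 * completes (illustrative only; the actual completion path lives in
 * rdmavt, not in this file):
 *
 *   wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
 *
 * Note that the TID RDMA work request opcodes deliberately complete as
 * plain IB_WC_RDMA_WRITE / IB_WC_RDMA_READ from the application's point
 * of view.
 */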
 177
 178/*
 179 * Length of header by opcode, 0 --> not supported
 180 */
 181const u8 hdr_len_by_opcode[256] = {
 182        /* RC */
 183        [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
 184        [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
 185        [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
 186        [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
 187        [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
 188        [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
 189        [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
 190        [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
 191        [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
 192        [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
 193        [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
 194        [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
 195        [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
 196        [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
 197        [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
 198        [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
 199        [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
 200        [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
 201        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4 + 8,
 202        [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
 203        [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
 204        [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
 205        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
 206        [IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
 207        [IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
 208        [IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
 209        [IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
 210        [IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
 211        [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
 212        [IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
 213        [IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
 214        /* UC */
 215        [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
 216        [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
 217        [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
 218        [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
 219        [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
 220        [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
 221        [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
 222        [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
 223        [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
 224        [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
 225        [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
 226        [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
 227        /* UD */
 228        [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
 229        [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
 230};
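
/*
 * Reading the table above: every entry starts from 12 + 8, i.e. the
 * 12-byte BTH plus the 8-byte 9B LRH.  The extra constants are the
 * standard extension headers carried by that opcode, for example +4 for
 * immediate data or an AETH/IETH, +16 for a RETH, +20 for RETH plus
 * immediate, +28 for an AtomicETH, +8 for an AtomicAckETH or a DETH, and
 * +36 for the KDETH/TID RDMA specific headers (whose exact layout is
 * defined by the TID RDMA code, not here).
 */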
 231
 232static const opcode_handler opcode_handler_tbl[256] = {
 233        /* RC */
 234        [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
 235        [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
 236        [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
 237        [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
 238        [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
 239        [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
 240        [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
 241        [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
 242        [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
 243        [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
 244        [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
 245        [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
 246        [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
 247        [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
 248        [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
 249        [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
 250        [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
 251        [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
 252        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
 253        [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
 254        [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
 255        [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 256        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 257
        /* TID RDMA has separate handlers for different opcodes. */
 259        [IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
 260        [IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
 261        [IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
 262        [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
 263        [IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
 264        [IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
 265        [IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
 266        [IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,
 267
 268        /* UC */
 269        [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
 270        [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
 271        [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
 272        [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
 273        [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
 274        [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
 275        [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
 276        [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
 277        [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
 278        [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
 279        [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
 280        [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
 281        /* UD */
 282        [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
 283        [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
 284        /* CNP */
 285        [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
 286};
 287
 288#define OPMASK 0x1f
 289
 290static const u32 pio_opmask[BIT(3)] = {
 291        /* RC */
 292        [IB_OPCODE_RC >> 5] =
 293                BIT(RC_OP(SEND_ONLY) & OPMASK) |
 294                BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
 295                BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) |
 296                BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) |
 297                BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) |
 298                BIT(RC_OP(ACKNOWLEDGE) & OPMASK) |
 299                BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) |
 300                BIT(RC_OP(COMPARE_SWAP) & OPMASK) |
 301                BIT(RC_OP(FETCH_ADD) & OPMASK),
 302        /* UC */
 303        [IB_OPCODE_UC >> 5] =
 304                BIT(UC_OP(SEND_ONLY) & OPMASK) |
 305                BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
 306                BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) |
 307                BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK),
 308};
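
/*
 * pio_opmask[] is indexed by the top three bits of the BTH opcode (the
 * transport group, hence the ">> 5") and each bit in an entry stands for
 * the low five opcode bits (OPMASK).  get_send_routine() tests it as
 *
 *   BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]
 *
 * so only the single-packet sends/writes, RDMA read requests, acks and
 * atomics listed above are ever candidates for PIO.
 */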
 309
 310/*
 311 * System image GUID.
 312 */
 313__be64 ib_hfi1_sys_image_guid;
 314
 315/*
 316 * Make sure the QP is ready and able to accept the given opcode.
 317 */
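/*
 * The upper three bits of a BTH opcode encode the transport (RC/UC/UD),
 * which is what packet->qp->allowed_ops holds, so the masked comparison
 * below only accepts opcodes belonging to this QP's transport; CNP is
 * the one opcode accepted on any QP type.  The handler returned is an
 * entry of opcode_handler_tbl[] above.
 */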
 318static inline opcode_handler qp_ok(struct hfi1_packet *packet)
 319{
 320        if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 321                return NULL;
 322        if (((packet->opcode & RVT_OPCODE_QP_MASK) ==
 323             packet->qp->allowed_ops) ||
 324            (packet->opcode == IB_OPCODE_CNP))
 325                return opcode_handler_tbl[packet->opcode];
 326
 327        return NULL;
 328}
 329
 330static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 331{
 332#ifdef CONFIG_FAULT_INJECTION
 333        if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
 334                /*
 335                 * In order to drop non-IB traffic we
 336                 * set PbcInsertHrc to NONE (0x2).
 337                 * The packet will still be delivered
 338                 * to the receiving node but a
 339                 * KHdrHCRCErr (KDETH packet with a bad
 340                 * HCRC) will be triggered and the
 341                 * packet will not be delivered to the
 342                 * correct context.
 343                 */
 344                pbc &= ~PBC_INSERT_HCRC_SMASK;
 345                pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
 346        } else {
 347                /*
 348                 * In order to drop regular verbs
 349                 * traffic we set the PbcTestEbp
 350                 * flag. The packet will still be
 351                 * delivered to the receiving node but
 352                 * a 'late ebp error' will be
 353                 * triggered and will be dropped.
 354                 */
 355                pbc |= PBC_TEST_EBP;
 356        }
 357#endif
 358        return pbc;
 359}
 360
 361static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
 362{
 363        if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
 364            !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 365                return NULL;
 366        if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
 367                return opcode_handler_tbl[opcode];
 368        return NULL;
 369}
 370
 371void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
 372{
 373        struct hfi1_ctxtdata *rcd = packet->rcd;
 374        struct ib_header *hdr = packet->hdr;
 375        u32 tlen = packet->tlen;
 376        struct hfi1_pportdata *ppd = rcd->ppd;
 377        struct hfi1_ibport *ibp = &ppd->ibport_data;
 378        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 379        opcode_handler opcode_handler;
 380        unsigned long flags;
 381        u32 qp_num;
 382        int lnh;
 383        u8 opcode;
 384
 385        /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
 386        if (unlikely(tlen < 15 * sizeof(u32)))
 387                goto drop;
 388
 389        lnh = be16_to_cpu(hdr->lrh[0]) & 3;
 390        if (lnh != HFI1_LRH_BTH)
 391                goto drop;
 392
 393        packet->ohdr = &hdr->u.oth;
 394        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
 395
 396        opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
 397        inc_opstats(tlen, &rcd->opstats->stats[opcode]);
 398
 399        /* verbs_qp can be picked up from any tid_rdma header struct */
 400        qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
 401                RVT_QPN_MASK;
 402
 403        rcu_read_lock();
 404        packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 405        if (!packet->qp)
 406                goto drop_rcu;
 407        spin_lock_irqsave(&packet->qp->r_lock, flags);
 408        opcode_handler = tid_qp_ok(opcode, packet);
 409        if (likely(opcode_handler))
 410                opcode_handler(packet);
 411        else
 412                goto drop_unlock;
 413        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 414        rcu_read_unlock();
 415
 416        return;
 417drop_unlock:
 418        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 419drop_rcu:
 420        rcu_read_unlock();
 421drop:
 422        ibp->rvp.n_pkt_drops++;
 423}
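
/*
 * hfi1_kdeth_expected_rcv() below is identical to the eager variant above
 * except that the verbs QP number is read from the TID RDMA response
 * header (r_rsp) rather than the request header (r_req); as the comments
 * note, verbs_qp sits at the same offset in every tid_rdma header layout.
 */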
 424
 425void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
 426{
 427        struct hfi1_ctxtdata *rcd = packet->rcd;
 428        struct ib_header *hdr = packet->hdr;
 429        u32 tlen = packet->tlen;
 430        struct hfi1_pportdata *ppd = rcd->ppd;
 431        struct hfi1_ibport *ibp = &ppd->ibport_data;
 432        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 433        opcode_handler opcode_handler;
 434        unsigned long flags;
 435        u32 qp_num;
 436        int lnh;
 437        u8 opcode;
 438
 439        /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
 440        if (unlikely(tlen < 15 * sizeof(u32)))
 441                goto drop;
 442
 443        lnh = be16_to_cpu(hdr->lrh[0]) & 3;
 444        if (lnh != HFI1_LRH_BTH)
 445                goto drop;
 446
 447        packet->ohdr = &hdr->u.oth;
 448        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
 449
 450        opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
 451        inc_opstats(tlen, &rcd->opstats->stats[opcode]);
 452
 453        /* verbs_qp can be picked up from any tid_rdma header struct */
 454        qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
 455                RVT_QPN_MASK;
 456
 457        rcu_read_lock();
 458        packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 459        if (!packet->qp)
 460                goto drop_rcu;
 461        spin_lock_irqsave(&packet->qp->r_lock, flags);
 462        opcode_handler = tid_qp_ok(opcode, packet);
 463        if (likely(opcode_handler))
 464                opcode_handler(packet);
 465        else
 466                goto drop_unlock;
 467        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 468        rcu_read_unlock();
 469
 470        return;
 471drop_unlock:
 472        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 473drop_rcu:
 474        rcu_read_unlock();
 475drop:
 476        ibp->rvp.n_pkt_drops++;
 477}
 478
 479static int hfi1_do_pkey_check(struct hfi1_packet *packet)
 480{
 481        struct hfi1_ctxtdata *rcd = packet->rcd;
 482        struct hfi1_pportdata *ppd = rcd->ppd;
 483        struct hfi1_16b_header *hdr = packet->hdr;
 484        u16 pkey;
 485
 486        /* Pkey check needed only for bypass packets */
 487        if (packet->etype != RHF_RCV_TYPE_BYPASS)
 488                return 0;
 489
 490        /* Perform pkey check */
 491        pkey = hfi1_16B_get_pkey(hdr);
 492        return ingress_pkey_check(ppd, pkey, packet->sc,
 493                                  packet->qp->s_pkey_index,
 494                                  packet->slid, true);
 495}
 496
 497static inline void hfi1_handle_packet(struct hfi1_packet *packet,
 498                                      bool is_mcast)
 499{
 500        u32 qp_num;
 501        struct hfi1_ctxtdata *rcd = packet->rcd;
 502        struct hfi1_pportdata *ppd = rcd->ppd;
 503        struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 504        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 505        opcode_handler packet_handler;
 506        unsigned long flags;
 507
 508        inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]);
 509
 510        if (unlikely(is_mcast)) {
 511                struct rvt_mcast *mcast;
 512                struct rvt_mcast_qp *p;
 513
 514                if (!packet->grh)
 515                        goto drop;
 516                mcast = rvt_mcast_find(&ibp->rvp,
 517                                       &packet->grh->dgid,
 518                                       opa_get_lid(packet->dlid, 9B));
 519                if (!mcast)
 520                        goto drop;
 521                list_for_each_entry_rcu(p, &mcast->qp_list, list) {
 522                        packet->qp = p->qp;
 523                        if (hfi1_do_pkey_check(packet))
 524                                goto drop;
 525                        spin_lock_irqsave(&packet->qp->r_lock, flags);
 526                        packet_handler = qp_ok(packet);
 527                        if (likely(packet_handler))
 528                                packet_handler(packet);
 529                        else
 530                                ibp->rvp.n_pkt_drops++;
 531                        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 532                }
 533                /*
 534                 * Notify rvt_multicast_detach() if it is waiting for us
 535                 * to finish.
 536                 */
 537                if (atomic_dec_return(&mcast->refcount) <= 1)
 538                        wake_up(&mcast->wait);
 539        } else {
 540                /* Get the destination QP number. */
 541                if (packet->etype == RHF_RCV_TYPE_BYPASS &&
 542                    hfi1_16B_get_l4(packet->hdr) == OPA_16B_L4_FM)
 543                        qp_num = hfi1_16B_get_dest_qpn(packet->mgmt);
 544                else
 545                        qp_num = ib_bth_get_qpn(packet->ohdr);
 546
 547                rcu_read_lock();
 548                packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 549                if (!packet->qp)
 550                        goto unlock_drop;
 551
 552                if (hfi1_do_pkey_check(packet))
 553                        goto unlock_drop;
 554
 555                spin_lock_irqsave(&packet->qp->r_lock, flags);
 556                packet_handler = qp_ok(packet);
 557                if (likely(packet_handler))
 558                        packet_handler(packet);
 559                else
 560                        ibp->rvp.n_pkt_drops++;
 561                spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 562                rcu_read_unlock();
 563        }
 564        return;
 565unlock_drop:
 566        rcu_read_unlock();
 567drop:
 568        ibp->rvp.n_pkt_drops++;
 569}
 570
 571/**
 572 * hfi1_ib_rcv - process an incoming packet
 573 * @packet: data packet information
 574 *
 575 * This is called to process an incoming packet at interrupt level.
 576 */
 577void hfi1_ib_rcv(struct hfi1_packet *packet)
 578{
 579        struct hfi1_ctxtdata *rcd = packet->rcd;
 580
 581        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
 582        hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
 583}
 584
 585void hfi1_16B_rcv(struct hfi1_packet *packet)
 586{
 587        struct hfi1_ctxtdata *rcd = packet->rcd;
 588
 589        trace_input_ibhdr(rcd->dd, packet, false);
 590        hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
 591}
 592
 593/*
 594 * This is called from a timer to check for QPs
 595 * which need kernel memory in order to send a packet.
 596 */
 597static void mem_timer(struct timer_list *t)
 598{
 599        struct hfi1_ibdev *dev = from_timer(dev, t, mem_timer);
 600        struct list_head *list = &dev->memwait;
 601        struct rvt_qp *qp = NULL;
 602        struct iowait *wait;
 603        unsigned long flags;
 604        struct hfi1_qp_priv *priv;
 605
 606        write_seqlock_irqsave(&dev->iowait_lock, flags);
 607        if (!list_empty(list)) {
 608                wait = list_first_entry(list, struct iowait, list);
 609                qp = iowait_to_qp(wait);
 610                priv = qp->priv;
 611                list_del_init(&priv->s_iowait.list);
 612                priv->s_iowait.lock = NULL;
 613                /* refcount held until actual wake up */
 614                if (!list_empty(list))
 615                        mod_timer(&dev->mem_timer, jiffies + 1);
 616        }
 617        write_sequnlock_irqrestore(&dev->iowait_lock, flags);
 618
 619        if (qp)
 620                hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
 621}
 622
 623/*
 624 * This is called with progress side lock held.
 625 */
 626/* New API */
 627static void verbs_sdma_complete(
 628        struct sdma_txreq *cookie,
 629        int status)
 630{
 631        struct verbs_txreq *tx =
 632                container_of(cookie, struct verbs_txreq, txreq);
 633        struct rvt_qp *qp = tx->qp;
 634
 635        spin_lock(&qp->s_lock);
 636        if (tx->wqe) {
 637                rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
 638        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
 639                struct hfi1_opa_header *hdr;
 640
 641                hdr = &tx->phdr.hdr;
 642                if (unlikely(status == SDMA_TXREQ_S_ABORTED))
 643                        hfi1_rc_verbs_aborted(qp, hdr);
 644                hfi1_rc_send_complete(qp, hdr);
 645        }
 646        spin_unlock(&qp->s_lock);
 647
 648        hfi1_put_txreq(tx);
 649}
 650
 651void hfi1_wait_kmem(struct rvt_qp *qp)
 652{
 653        struct hfi1_qp_priv *priv = qp->priv;
 654        struct ib_qp *ibqp = &qp->ibqp;
 655        struct ib_device *ibdev = ibqp->device;
 656        struct hfi1_ibdev *dev = to_idev(ibdev);
 657
 658        if (list_empty(&priv->s_iowait.list)) {
 659                if (list_empty(&dev->memwait))
 660                        mod_timer(&dev->mem_timer, jiffies + 1);
 661                qp->s_flags |= RVT_S_WAIT_KMEM;
 662                list_add_tail(&priv->s_iowait.list, &dev->memwait);
 663                priv->s_iowait.lock = &dev->iowait_lock;
 664                trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
 665                rvt_get_qp(qp);
 666        }
 667}
 668
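/*
 * Returns -EBUSY when the QP has been queued on dev->memwait and the
 * txreq has been parked on ps->wait->tx_head (so the caller must not
 * free it), or 0 when the QP is no longer in a sending state, in which
 * case the caller frees the txreq itself (see the bail_build path in
 * hfi1_verbs_send_dma()).
 */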
 669static int wait_kmem(struct hfi1_ibdev *dev,
 670                     struct rvt_qp *qp,
 671                     struct hfi1_pkt_state *ps)
 672{
 673        unsigned long flags;
 674        int ret = 0;
 675
 676        spin_lock_irqsave(&qp->s_lock, flags);
 677        if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 678                write_seqlock(&dev->iowait_lock);
 679                list_add_tail(&ps->s_txreq->txreq.list,
 680                              &ps->wait->tx_head);
 681                hfi1_wait_kmem(qp);
 682                write_sequnlock(&dev->iowait_lock);
 683                hfi1_qp_unbusy(qp, ps->wait);
 684                ret = -EBUSY;
 685        }
 686        spin_unlock_irqrestore(&qp->s_lock, flags);
 687
 688        return ret;
 689}
 690
 691/*
 692 * This routine calls txadds for each sg entry.
 693 *
 694 * Add failures will revert the sge cursor
 695 */
 696static noinline int build_verbs_ulp_payload(
 697        struct sdma_engine *sde,
 698        u32 length,
 699        struct verbs_txreq *tx)
 700{
 701        struct rvt_sge_state *ss = tx->ss;
 702        struct rvt_sge *sg_list = ss->sg_list;
 703        struct rvt_sge sge = ss->sge;
 704        u8 num_sge = ss->num_sge;
 705        u32 len;
 706        int ret = 0;
 707
 708        while (length) {
 709                len = rvt_get_sge_length(&ss->sge, length);
 710                WARN_ON_ONCE(len == 0);
 711                ret = sdma_txadd_kvaddr(
 712                        sde->dd,
 713                        &tx->txreq,
 714                        ss->sge.vaddr,
 715                        len);
 716                if (ret)
 717                        goto bail_txadd;
 718                rvt_update_sge(ss, len, false);
 719                length -= len;
 720        }
 721        return ret;
 722bail_txadd:
 723        /* unwind cursor */
 724        ss->sge = sge;
 725        ss->num_sge = num_sge;
 726        ss->sg_list = sg_list;
 727        return ret;
 728}
 729
 730/**
 731 * update_tx_opstats - record stats by opcode
 * @qp: the qp
 733 * @ps: transmit packet state
 734 * @plen: the plen in dwords
 735 *
 736 * This is a routine to record the tx opstats after a
 737 * packet has been presented to the egress mechanism.
 738 */
 739static void update_tx_opstats(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 740                              u32 plen)
 741{
 742#ifdef CONFIG_DEBUG_FS
 743        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
 744        struct hfi1_opcode_stats_perctx *s = get_cpu_ptr(dd->tx_opstats);
 745
 746        inc_opstats(plen * 4, &s->stats[ps->opcode]);
 747        put_cpu_ptr(s);
 748#endif
 749}
 750
 751/*
 * Build the DMA descriptors needed to send length bytes of data.
 *
 * NOTE: the DMA mapping is held in the tx until it is completed in the ring
 *       or the tx desc is freed without having been submitted to the ring.
 *
 * This routine checks the return of every helper call and bails out on failure.
 758 */
 759/* New API */
 760static int build_verbs_tx_desc(
 761        struct sdma_engine *sde,
 762        u32 length,
 763        struct verbs_txreq *tx,
 764        struct hfi1_ahg_info *ahg_info,
 765        u64 pbc)
 766{
 767        int ret = 0;
 768        struct hfi1_sdma_header *phdr = &tx->phdr;
 769        u16 hdrbytes = (tx->hdr_dwords + sizeof(pbc) / 4) << 2;
 770        u8 extra_bytes = 0;
 771
 772        if (tx->phdr.hdr.hdr_type) {
 773                /*
 774                 * hdrbytes accounts for PBC. Need to subtract 8 bytes
 775                 * before calculating padding.
 776                 */
 777                extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) +
 778                              (SIZE_OF_CRC << 2) + SIZE_OF_LT;
 779        }
 780        if (!ahg_info->ahgcount) {
 781                ret = sdma_txinit_ahg(
 782                        &tx->txreq,
 783                        ahg_info->tx_flags,
 784                        hdrbytes + length +
 785                        extra_bytes,
 786                        ahg_info->ahgidx,
 787                        0,
 788                        NULL,
 789                        0,
 790                        verbs_sdma_complete);
 791                if (ret)
 792                        goto bail_txadd;
 793                phdr->pbc = cpu_to_le64(pbc);
 794                ret = sdma_txadd_kvaddr(
 795                        sde->dd,
 796                        &tx->txreq,
 797                        phdr,
 798                        hdrbytes);
 799                if (ret)
 800                        goto bail_txadd;
 801        } else {
 802                ret = sdma_txinit_ahg(
 803                        &tx->txreq,
 804                        ahg_info->tx_flags,
 805                        length,
 806                        ahg_info->ahgidx,
 807                        ahg_info->ahgcount,
 808                        ahg_info->ahgdesc,
 809                        hdrbytes,
 810                        verbs_sdma_complete);
 811                if (ret)
 812                        goto bail_txadd;
 813        }
 814        /* add the ulp payload - if any. tx->ss can be NULL for acks */
 815        if (tx->ss) {
 816                ret = build_verbs_ulp_payload(sde, length, tx);
 817                if (ret)
 818                        goto bail_txadd;
 819        }
 820
 821        /* add icrc, lt byte, and padding to flit */
 822        if (extra_bytes)
 823                ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
 824                                        (void *)trail_buf, extra_bytes);
 825
 826bail_txadd:
 827        return ret;
 828}
 829
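/*
 * TID RDMA packets carry a KDETH header that is covered by its own header
 * CRC (HCRC).  For those opcodes the PBC is adjusted so that the send
 * engine inserts the local KDETH HCRC (PBC_IHCRC_LKDETH); every other
 * opcode leaves the PBC untouched.
 */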
 830static u64 update_hcrc(u8 opcode, u64 pbc)
 831{
 832        if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
 833                pbc &= ~PBC_INSERT_HCRC_SMASK;
 834                pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
 835        }
 836        return pbc;
 837}
 838
 839int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 840                        u64 pbc)
 841{
 842        struct hfi1_qp_priv *priv = qp->priv;
 843        struct hfi1_ahg_info *ahg_info = priv->s_ahg;
 844        u32 hdrwords = ps->s_txreq->hdr_dwords;
 845        u32 len = ps->s_txreq->s_cur_size;
 846        u32 plen;
 847        struct hfi1_ibdev *dev = ps->dev;
 848        struct hfi1_pportdata *ppd = ps->ppd;
 849        struct verbs_txreq *tx;
 850        u8 sc5 = priv->s_sc;
 851        int ret;
 852        u32 dwords;
 853
 854        if (ps->s_txreq->phdr.hdr.hdr_type) {
 855                u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len);
 856
 857                dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) +
 858                          SIZE_OF_LT) >> 2;
 859        } else {
 860                dwords = (len + 3) >> 2;
 861        }
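        /*
         * plen is the packet length in dwords as given to create_pbc():
         * the header, the (padded) payload, and the two dwords of PBC
         * (sizeof(pbc) / 4 for a u64 pbc).  For 16B packets the payload
         * dwords computed above already include the pad, ICRC and LT byte.
         */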
 862        plen = hdrwords + dwords + sizeof(pbc) / 4;
 863
 864        tx = ps->s_txreq;
 865        if (!sdma_txreq_built(&tx->txreq)) {
 866                if (likely(pbc == 0)) {
 867                        u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
 868
 869                        /* No vl15 here */
 870                        /* set PBC_DC_INFO bit (aka SC[4]) in pbc */
 871                        if (ps->s_txreq->phdr.hdr.hdr_type)
 872                                pbc |= PBC_PACKET_BYPASS |
 873                                       PBC_INSERT_BYPASS_ICRC;
 874                        else
 875                                pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 876
 877                        if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
 878                                pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
 879                        pbc = create_pbc(ppd,
 880                                         pbc,
 881                                         qp->srate_mbps,
 882                                         vl,
 883                                         plen);
 884
 885                        /* Update HCRC based on packet opcode */
 886                        pbc = update_hcrc(ps->opcode, pbc);
 887                }
 888                tx->wqe = qp->s_wqe;
 889                ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
 890                if (unlikely(ret))
 891                        goto bail_build;
 892        }
 893        ret =  sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
 894        if (unlikely(ret < 0)) {
 895                if (ret == -ECOMM)
 896                        goto bail_ecomm;
 897                return ret;
 898        }
 899
 900        update_tx_opstats(qp, ps, plen);
 901        trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
 902                                &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 903        return ret;
 904
 905bail_ecomm:
 906        /* The current one got "sent" */
 907        return 0;
 908bail_build:
 909        ret = wait_kmem(dev, qp, ps);
 910        if (!ret) {
 911                /* free txreq - bad state */
 912                hfi1_put_txreq(ps->s_txreq);
 913                ps->s_txreq = NULL;
 914        }
 915        return ret;
 916}
 917
 918/*
 919 * If we are now in the error state, return zero to flush the
 920 * send work request.
 921 */
 922static int pio_wait(struct rvt_qp *qp,
 923                    struct send_context *sc,
 924                    struct hfi1_pkt_state *ps,
 925                    u32 flag)
 926{
 927        struct hfi1_qp_priv *priv = qp->priv;
 928        struct hfi1_devdata *dd = sc->dd;
 929        unsigned long flags;
 930        int ret = 0;
 931
 932        /*
 933         * Note that as soon as want_buffer() is called and
 934         * possibly before it returns, sc_piobufavail()
 935         * could be called. Therefore, put QP on the I/O wait list before
 936         * enabling the PIO avail interrupt.
 937         */
 938        spin_lock_irqsave(&qp->s_lock, flags);
 939        if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 940                write_seqlock(&sc->waitlock);
 941                list_add_tail(&ps->s_txreq->txreq.list,
 942                              &ps->wait->tx_head);
 943                if (list_empty(&priv->s_iowait.list)) {
 944                        struct hfi1_ibdev *dev = &dd->verbs_dev;
 945                        int was_empty;
 946
 947                        dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
 948                        dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
 949                        qp->s_flags |= flag;
 950                        was_empty = list_empty(&sc->piowait);
 951                        iowait_get_priority(&priv->s_iowait);
 952                        iowait_queue(ps->pkts_sent, &priv->s_iowait,
 953                                     &sc->piowait);
 954                        priv->s_iowait.lock = &sc->waitlock;
 955                        trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 956                        rvt_get_qp(qp);
 957                        /* counting: only call wantpiobuf_intr if first user */
 958                        if (was_empty)
 959                                hfi1_sc_wantpiobuf_intr(sc, 1);
 960                }
 961                write_sequnlock(&sc->waitlock);
 962                hfi1_qp_unbusy(qp, ps->wait);
 963                ret = -EBUSY;
 964        }
 965        spin_unlock_irqrestore(&qp->s_lock, flags);
 966        return ret;
 967}
 968
 969static void verbs_pio_complete(void *arg, int code)
 970{
 971        struct rvt_qp *qp = (struct rvt_qp *)arg;
 972        struct hfi1_qp_priv *priv = qp->priv;
 973
 974        if (iowait_pio_dec(&priv->s_iowait))
 975                iowait_drain_wakeup(&priv->s_iowait);
 976}
 977
 978int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 979                        u64 pbc)
 980{
 981        struct hfi1_qp_priv *priv = qp->priv;
 982        u32 hdrwords = ps->s_txreq->hdr_dwords;
 983        struct rvt_sge_state *ss = ps->s_txreq->ss;
 984        u32 len = ps->s_txreq->s_cur_size;
 985        u32 dwords;
 986        u32 plen;
 987        struct hfi1_pportdata *ppd = ps->ppd;
 988        u32 *hdr;
 989        u8 sc5;
 990        unsigned long flags = 0;
 991        struct send_context *sc;
 992        struct pio_buf *pbuf;
 993        int wc_status = IB_WC_SUCCESS;
 994        int ret = 0;
 995        pio_release_cb cb = NULL;
 996        u8 extra_bytes = 0;
 997
 998        if (ps->s_txreq->phdr.hdr.hdr_type) {
 999                u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len);
1000
1001                extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT;
1002                dwords = (len + extra_bytes) >> 2;
1003                hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah;
1004        } else {
1005                dwords = (len + 3) >> 2;
1006                hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh;
1007        }
1008        plen = hdrwords + dwords + sizeof(pbc) / 4;
1009
1010        /* only RC/UC use complete */
1011        switch (qp->ibqp.qp_type) {
1012        case IB_QPT_RC:
1013        case IB_QPT_UC:
1014                cb = verbs_pio_complete;
1015                break;
1016        default:
1017                break;
1018        }
1019
1020        /* vl15 special case taken care of in ud.c */
1021        sc5 = priv->s_sc;
1022        sc = ps->s_txreq->psc;
1023
1024        if (likely(pbc == 0)) {
1025                u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
1026
1027                /* set PBC_DC_INFO bit (aka SC[4]) in pbc */
1028                if (ps->s_txreq->phdr.hdr.hdr_type)
1029                        pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
1030                else
1031                        pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
1032
1033                if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
1034                        pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
1035                pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
1036
1037                /* Update HCRC based on packet opcode */
1038                pbc = update_hcrc(ps->opcode, pbc);
1039        }
1040        if (cb)
1041                iowait_pio_inc(&priv->s_iowait);
1042        pbuf = sc_buffer_alloc(sc, plen, cb, qp);
1043        if (unlikely(IS_ERR_OR_NULL(pbuf))) {
1044                if (cb)
1045                        verbs_pio_complete(qp, 0);
1046                if (IS_ERR(pbuf)) {
1047                        /*
1048                         * If we have filled the PIO buffers to capacity and are
                         * not in an active state, this request is not going to
                         * go out, so just complete it with an error; otherwise a
                         * ULP or the core may be stuck waiting.
1052                         */
1053                        hfi1_cdbg(
1054                                PIO,
1055                                "alloc failed. state not active, completing");
1056                        wc_status = IB_WC_GENERAL_ERR;
1057                        goto pio_bail;
1058                } else {
1059                        /*
                         * This is a normal occurrence: the PIO buffers are full,
                         * but the QP is still in a state that allows sending, so
                         * continue by queuing the request.
1063                         */
1064                        hfi1_cdbg(PIO, "alloc failed. state active, queuing");
1065                        ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
1066                        if (!ret)
1067                                /* txreq not queued - free */
1068                                goto bail;
1069                        /* tx consumed in wait */
1070                        return ret;
1071                }
1072        }
1073
1074        if (dwords == 0) {
1075                pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
1076        } else {
1077                seg_pio_copy_start(pbuf, pbc,
1078                                   hdr, hdrwords * 4);
1079                if (ss) {
1080                        while (len) {
1081                                void *addr = ss->sge.vaddr;
1082                                u32 slen = rvt_get_sge_length(&ss->sge, len);
1083
1084                                rvt_update_sge(ss, slen, false);
1085                                seg_pio_copy_mid(pbuf, addr, slen);
1086                                len -= slen;
1087                        }
1088                }
1089                /* add icrc, lt byte, and padding to flit */
1090                if (extra_bytes)
1091                        seg_pio_copy_mid(pbuf, trail_buf, extra_bytes);
1092
1093                seg_pio_copy_end(pbuf);
1094        }
1095
1096        update_tx_opstats(qp, ps, plen);
1097        trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
1098                               &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
1099
1100pio_bail:
1101        spin_lock_irqsave(&qp->s_lock, flags);
1102        if (qp->s_wqe) {
1103                rvt_send_complete(qp, qp->s_wqe, wc_status);
1104        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
1105                if (unlikely(wc_status == IB_WC_GENERAL_ERR))
1106                        hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
1107                hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
1108        }
1109        spin_unlock_irqrestore(&qp->s_lock, flags);
1110
1111        ret = 0;
1112
1113bail:
1114        hfi1_put_txreq(ps->s_txreq);
1115        return ret;
1116}
1117
1118/*
1119 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
1120 * being an entry from the partition key table), return 0
1121 * otherwise. Use the matching criteria for egress partition keys
 * specified in the OPAv1 spec., section 9.11.7.
1123 */
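/*
 * Worked example: a packet pkey of 0x8001 (full member of partition 1)
 * matches a table entry of 0x8001 but not 0x0001, while a packet pkey of
 * 0x0001 (limited member) matches either 0x0001 or 0x8001; i.e. a
 * full-member sender requires a full-member table entry, whereas a
 * limited-member sender only needs the low 15 bits to match.
 */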
1124static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
1125{
1126        u16 mkey = pkey & PKEY_LOW_15_MASK;
1127        u16 mentry = ent & PKEY_LOW_15_MASK;
1128
1129        if (mkey == mentry) {
1130                /*
1131                 * If pkey[15] is set (full partition member),
1132                 * is bit 15 in the corresponding table element
1133                 * clear (limited member)?
1134                 */
1135                if (pkey & PKEY_MEMBER_MASK)
1136                        return !!(ent & PKEY_MEMBER_MASK);
1137                return 1;
1138        }
1139        return 0;
1140}
1141
1142/**
1143 * egress_pkey_check - check P_KEY of a packet
1144 * @ppd:  Physical IB port data
1145 * @slid: SLID for packet
 * @pkey: PKEY for header
 * @sc5:  SC for packet
 * @s_pkey_index: used as a lookup optimization for kernel contexts only;
 * a negative value means a user context is calling this
 * function.
 *
 * It checks whether the header's pkey is valid.
 *
 * Return: 0 on success, 1 otherwise
1155 */
1156int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey,
1157                      u8 sc5, int8_t s_pkey_index)
1158{
1159        struct hfi1_devdata *dd;
1160        int i;
1161        int is_user_ctxt_mechanism = (s_pkey_index < 0);
1162
1163        if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
1164                return 0;
1165
1166        /* If SC15, pkey[0:14] must be 0x7fff */
1167        if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
1168                goto bad;
1169
1170        /* Is the pkey = 0x0, or 0x8000? */
1171        if ((pkey & PKEY_LOW_15_MASK) == 0)
1172                goto bad;
1173
1174        /*
1175         * For the kernel contexts only, if a qp is passed into the function,
1176         * the most likely matching pkey has index qp->s_pkey_index
1177         */
1178        if (!is_user_ctxt_mechanism &&
1179            egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
1180                return 0;
1181        }
1182
1183        for (i = 0; i < MAX_PKEY_VALUES; i++) {
1184                if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
1185                        return 0;
1186        }
1187bad:
1188        /*
1189         * For the user-context mechanism, the P_KEY check would only happen
1190         * once per SDMA request, not once per packet.  Therefore, there's no
1191         * need to increment the counter for the user-context mechanism.
1192         */
1193        if (!is_user_ctxt_mechanism) {
1194                incr_cntr64(&ppd->port_xmit_constraint_errors);
1195                dd = ppd->dd;
1196                if (!(dd->err_info_xmit_constraint.status &
1197                      OPA_EI_STATUS_SMASK)) {
1198                        dd->err_info_xmit_constraint.status |=
1199                                OPA_EI_STATUS_SMASK;
1200                        dd->err_info_xmit_constraint.slid = slid;
1201                        dd->err_info_xmit_constraint.pkey = pkey;
1202                }
1203        }
1204        return 1;
1205}
1206
/**
 * get_send_routine - choose an egress routine
 * @qp: the QP to send on
 * @ps: the state of the packet to send
 *
 * Choose an egress routine (PIO or SDMA) based on QP type
 * and packet size.
 */
1213static inline send_routine get_send_routine(struct rvt_qp *qp,
1214                                            struct hfi1_pkt_state *ps)
1215{
1216        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1217        struct hfi1_qp_priv *priv = qp->priv;
1218        struct verbs_txreq *tx = ps->s_txreq;
1219
1220        if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
1221                return dd->process_pio_send;
1222        switch (qp->ibqp.qp_type) {
1223        case IB_QPT_SMI:
1224                return dd->process_pio_send;
1225        case IB_QPT_GSI:
1226        case IB_QPT_UD:
1227                break;
1228        case IB_QPT_UC:
1229        case IB_QPT_RC:
1230                priv->s_running_pkt_size =
1231                        (tx->s_cur_size + priv->s_running_pkt_size) / 2;
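                /*
                 * s_running_pkt_size is a simple running average of send
                 * sizes on this QP.  PIO is used only when piothreshold is
                 * nonzero, that average is at or below both piothreshold
                 * and the path MTU, the opcode is PIO-eligible per
                 * pio_opmask[], the txreq has not already been built for
                 * SDMA, and no SDMA work is pending on the QP; otherwise
                 * the packet goes out via SDMA.
                 */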
1232                if (piothreshold &&
1233                    priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
1234                    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
1235                    iowait_sdma_pending(&priv->s_iowait) == 0 &&
1236                    !sdma_txreq_built(&tx->txreq))
1237                        return dd->process_pio_send;
1238                break;
1239        default:
1240                break;
1241        }
1242        return dd->process_dma_send;
1243}
1244
1245/**
1246 * hfi1_verbs_send - send a packet
1247 * @qp: the QP to send on
1248 * @ps: the state of the packet to send
1249 *
1250 * Return zero if packet is sent or queued OK.
1251 * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
1252 */
1253int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
1254{
1255        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1256        struct hfi1_qp_priv *priv = qp->priv;
1257        struct ib_other_headers *ohdr = NULL;
1258        send_routine sr;
1259        int ret;
1260        u16 pkey;
1261        u32 slid;
1262        u8 l4 = 0;
1263
1264        /* locate the pkey within the headers */
1265        if (ps->s_txreq->phdr.hdr.hdr_type) {
1266                struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah;
1267
1268                l4 = hfi1_16B_get_l4(hdr);
1269                if (l4 == OPA_16B_L4_IB_LOCAL)
1270                        ohdr = &hdr->u.oth;
1271                else if (l4 == OPA_16B_L4_IB_GLOBAL)
1272                        ohdr = &hdr->u.l.oth;
1273
1274                slid = hfi1_16B_get_slid(hdr);
1275                pkey = hfi1_16B_get_pkey(hdr);
1276        } else {
1277                struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh;
1278                u8 lnh = ib_get_lnh(hdr);
1279
1280                if (lnh == HFI1_LRH_GRH)
1281                        ohdr = &hdr->u.l.oth;
1282                else
1283                        ohdr = &hdr->u.oth;
1284                slid = ib_get_slid(hdr);
1285                pkey = ib_bth_get_pkey(ohdr);
1286        }
1287
1288        if (likely(l4 != OPA_16B_L4_FM))
1289                ps->opcode = ib_bth_get_opcode(ohdr);
1290        else
1291                ps->opcode = IB_OPCODE_UD_SEND_ONLY;
1292
1293        sr = get_send_routine(qp, ps);
1294        ret = egress_pkey_check(dd->pport, slid, pkey,
1295                                priv->s_sc, qp->s_pkey_index);
1296        if (unlikely(ret)) {
1297                /*
1298                 * The value we are returning here does not get propagated to
1299                 * the verbs caller. Thus we need to complete the request with
1300                 * error otherwise the caller could be sitting waiting on the
1301                 * completion event. Only do this for PIO. SDMA has its own
1302                 * mechanism for handling the errors. So for SDMA we can just
1303                 * return.
1304                 */
1305                if (sr == dd->process_pio_send) {
1306                        unsigned long flags;
1307
1308                        hfi1_cdbg(PIO, "%s() Failed. Completing with err",
1309                                  __func__);
1310                        spin_lock_irqsave(&qp->s_lock, flags);
1311                        rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
1312                        spin_unlock_irqrestore(&qp->s_lock, flags);
1313                }
1314                return -EINVAL;
1315        }
1316        if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
1317                return pio_wait(qp,
1318                                ps->s_txreq->psc,
1319                                ps,
1320                                HFI1_S_WAIT_PIO_DRAIN);
1321        return sr(qp, ps, 0);
1322}
1323
1324/**
1325 * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
1326 * @dd: the device data structure
1327 */
1328static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
1329{
1330        struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
1331        u32 ver = dd->dc8051_ver;
1332
1333        memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
1334
1335        rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) |
1336                ((u64)(dc8051_ver_min(ver)) << 16) |
1337                (u64)dc8051_ver_patch(ver);
1338
1339        rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1340                        IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1341                        IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1342                        IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
1343                        IB_DEVICE_MEM_MGT_EXTENSIONS |
1344                        IB_DEVICE_RDMA_NETDEV_OPA_VNIC;
1345        rdi->dparms.props.page_size_cap = PAGE_SIZE;
1346        rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
1347        rdi->dparms.props.vendor_part_id = dd->pcidev->device;
1348        rdi->dparms.props.hw_ver = dd->minrev;
1349        rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
1350        rdi->dparms.props.max_mr_size = U64_MAX;
1351        rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
1352        rdi->dparms.props.max_qp = hfi1_max_qps;
1353        rdi->dparms.props.max_qp_wr =
1354                (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
1355                 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
1356        rdi->dparms.props.max_send_sge = hfi1_max_sges;
1357        rdi->dparms.props.max_recv_sge = hfi1_max_sges;
1358        rdi->dparms.props.max_sge_rd = hfi1_max_sges;
1359        rdi->dparms.props.max_cq = hfi1_max_cqs;
1360        rdi->dparms.props.max_ah = hfi1_max_ahs;
1361        rdi->dparms.props.max_cqe = hfi1_max_cqes;
1362        rdi->dparms.props.max_map_per_fmr = 32767;
1363        rdi->dparms.props.max_pd = hfi1_max_pds;
1364        rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
1365        rdi->dparms.props.max_qp_init_rd_atom = 255;
1366        rdi->dparms.props.max_srq = hfi1_max_srqs;
1367        rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
1368        rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
1369        rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
1370        rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
1371        rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
1372        rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
1373        rdi->dparms.props.max_total_mcast_qp_attach =
1374                                        rdi->dparms.props.max_mcast_qp_attach *
1375                                        rdi->dparms.props.max_mcast_grp;
1376}
1377
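/*
 * Map OPA per-lane link speed flags onto the closest IB speed
 * encodings: 25 Gbps maps to EDR, 12.5 Gbps maps to FDR.
 */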
1378static inline u16 opa_speed_to_ib(u16 in)
1379{
1380        u16 out = 0;
1381
1382        if (in & OPA_LINK_SPEED_25G)
1383                out |= IB_SPEED_EDR;
1384        if (in & OPA_LINK_SPEED_12_5G)
1385                out |= IB_SPEED_FDR;
1386
1387        return out;
1388}
1389
1390/*
1391 * Convert a single OPA link width (no multiple flags) to an IB value.
1392 * A zero OPA link width means link down, which means the IB width value
1393 * is a don't care.
1394 */
1395static inline u16 opa_width_to_ib(u16 in)
1396{
1397        switch (in) {
1398        case OPA_LINK_WIDTH_1X:
1399        /* map 2x and 3x to 1x as they don't exist in IB */
1400        case OPA_LINK_WIDTH_2X:
1401        case OPA_LINK_WIDTH_3X:
1402                return IB_WIDTH_1X;
1403        default: /* link down or unknown, return our largest width */
1404        case OPA_LINK_WIDTH_4X:
1405                return IB_WIDTH_4X;
1406        }
1407}
1408
1409static int query_port(struct rvt_dev_info *rdi, u8 port_num,
1410                      struct ib_port_attr *props)
1411{
1412        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1413        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1414        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1415        u32 lid = ppd->lid;
1416
1417        /* props is zeroed by the caller, avoid zeroing it here */
1418        props->lid = lid;
1419        props->lmc = ppd->lmc;
1420        /* OPA logical states match IB logical states */
1421        props->state = driver_lstate(ppd);
1422        props->phys_state = driver_pstate(ppd);
1423        props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
1424        props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
1425        /* see rate_show() in ib core/sysfs.c */
1426        props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
1427        props->max_vl_num = ppd->vls_supported;
1428
1429        /* Once we are a "first class" citizen and have added the OPA MTUs to
1430         * the core, we can advertise the larger MTU enum to the ULPs; for now,
1431         * advertise only 4K.
1432         *
1433         * Those applications which are either OPA aware or pass the MTU enum
1434         * from the Path Records to us will get the new 8k MTU.  Those that
1435         * attempt to process the MTU enum may fail in various ways.
1436         */
1437        props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
1438                                      4096 : hfi1_max_mtu), IB_MTU_4096);
1439        props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
1440                mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
1441
1442        return 0;
1443}
1444
1445static int modify_device(struct ib_device *device,
1446                         int device_modify_mask,
1447                         struct ib_device_modify *device_modify)
1448{
1449        struct hfi1_devdata *dd = dd_from_ibdev(device);
1450        unsigned i;
1451        int ret;
1452
1453        if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1454                                   IB_DEVICE_MODIFY_NODE_DESC)) {
1455                ret = -EOPNOTSUPP;
1456                goto bail;
1457        }
1458
1459        if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
1460                memcpy(device->node_desc, device_modify->node_desc,
1461                       IB_DEVICE_NODE_DESC_MAX);
1462                for (i = 0; i < dd->num_pports; i++) {
1463                        struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1464
1465                        hfi1_node_desc_chg(ibp);
1466                }
1467        }
1468
1469        if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
1470                ib_hfi1_sys_image_guid =
1471                        cpu_to_be64(device_modify->sys_image_guid);
1472                for (i = 0; i < dd->num_pports; i++) {
1473                        struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1474
1475                        hfi1_sys_guid_chg(ibp);
1476                }
1477        }
1478
1479        ret = 0;
1480
1481bail:
1482        return ret;
1483}
1484
1485static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
1486{
1487        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1488        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1489        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1490        int ret;
1491
1492        set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
1493                             OPA_LINKDOWN_REASON_UNKNOWN);
1494        ret = set_link_state(ppd, HLS_DN_DOWNDEF);
1495        return ret;
1496}
1497
1498static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
1499                            int guid_index, __be64 *guid)
1500{
1501        struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
1502
1503        if (guid_index >= HFI1_GUIDS_PER_PORT)
1504                return -EINVAL;
1505
1506        *guid = get_sguid(ibp, guid_index);
1507        return 0;
1508}
1509
1510/*
1511 * Convert an AH's port and SL to an SC.
1512 */
1513u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah)
1514{
1515        struct hfi1_ibport *ibp = to_iport(ibdev, rdma_ah_get_port_num(ah));
1516
1517        return ibp->sl_to_sc[rdma_ah_get_sl(ah)];
1518}
1519
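/*
 * Validate an address handle: a multicast DLID requires a GRH, the SL
 * must index into sl_to_sc, and the SC derived from the SL must map to
 * a usable VL.
 */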
1520static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
1521{
1522        struct hfi1_ibport *ibp;
1523        struct hfi1_pportdata *ppd;
1524        struct hfi1_devdata *dd;
1525        u8 sc5;
1526        u8 sl;
1527
1528        if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) &&
1529            !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
1530                return -EINVAL;
1531
1532        /* test the mapping for validity */
1533        ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
1534        ppd = ppd_from_ibp(ibp);
1535        dd = dd_from_ppd(ppd);
1536
1537        sl = rdma_ah_get_sl(ah_attr);
1538        if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
1539                return -EINVAL;
1540        sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc));
1541
1542        sc5 = ibp->sl_to_sc[sl];
1543        if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
1544                return -EINVAL;
1545        return 0;
1546}
1547
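/*
 * Called by rdmavt when a new AH is created: derive the VL from the
 * SL-to-SC mapping and cache the log2 path MTU for that VL.
 */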
1548static void hfi1_notify_new_ah(struct ib_device *ibdev,
1549                               struct rdma_ah_attr *ah_attr,
1550                               struct rvt_ah *ah)
1551{
1552        struct hfi1_ibport *ibp;
1553        struct hfi1_pportdata *ppd;
1554        struct hfi1_devdata *dd;
1555        u8 sc5;
1556        struct rdma_ah_attr *attr = &ah->attr;
1557
1558        /*
1559         * Do not trust reading anything from rvt_ah at this point as it is
1560         * not yet fully set up. We can, however, modify the fields we need to set.
1561         */
1562
1563        ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
1564        ppd = ppd_from_ibp(ibp);
1565        sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)];
1566        hfi1_update_ah_attr(ibdev, attr);
1567        hfi1_make_opa_lid(attr);
1568        dd = dd_from_ppd(ppd);
1569        ah->vl = sc_to_vlt(dd, sc5);
1570        if (ah->vl < num_vls || ah->vl == 15)
1571                ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
1572}
1573
1574/**
1575 * hfi1_get_npkeys - return the size of the PKEY table for context 0
1576 * @dd: the hfi1_ib device
1577 */
1578unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
1579{
1580        return ARRAY_SIZE(dd->pport[0].pkeys);
1581}
1582
1583static void init_ibport(struct hfi1_pportdata *ppd)
1584{
1585        struct hfi1_ibport *ibp = &ppd->ibport_data;
1586        size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
1587        int i;
1588
1589        for (i = 0; i < sz; i++) {
1590                ibp->sl_to_sc[i] = i;
1591                ibp->sc_to_sl[i] = i;
1592        }
1593
1594        for (i = 0; i < RVT_MAX_TRAP_LISTS; i++)
1595                INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list);
1596        timer_setup(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, 0);
1597
1598        spin_lock_init(&ibp->rvp.lock);
1599        /* Set the prefix to the default value (see ch. 4.1.1) */
1600        ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
1601        ibp->rvp.sm_lid = 0;
1602        /*
1603         * Below should only set bits defined in OPA PortInfo.CapabilityMask
1604         * and PortInfo.CapabilityMask3
1605         */
1606        ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
1607                IB_PORT_CAP_MASK_NOTICE_SUP;
1608        ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported;
1609        ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1610        ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1611        ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1612        ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1613        ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1614
1615        RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
1616        RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
1617}
1618
1619static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str)
1620{
1621        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
1622        struct hfi1_ibdev *dev = dev_from_rdi(rdi);
1623        u32 ver = dd_from_dev(dev)->dc8051_ver;
1624
1625        snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver),
1626                 dc8051_ver_min(ver), dc8051_ver_patch(ver));
1627}
1628
1629static const char * const driver_cntr_names[] = {
1630        /* must be element 0 */
1631        "DRIVER_KernIntr",
1632        "DRIVER_ErrorIntr",
1633        "DRIVER_Tx_Errs",
1634        "DRIVER_Rcv_Errs",
1635        "DRIVER_HW_Errs",
1636        "DRIVER_NoPIOBufs",
1637        "DRIVER_CtxtsOpen",
1638        "DRIVER_RcvLen_Errs",
1639        "DRIVER_EgrBufFull",
1640        "DRIVER_EgrHdrFull"
1641};
1642
1643static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names buffers */
1644static const char **dev_cntr_names;
1645static const char **port_cntr_names;
1646int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names);
1647static int num_dev_cntrs;
1648static int num_port_cntrs;
1649static int cntr_names_initialized;
1650
1651/*
1652 * Convert a list of names separated by '\n' into an array of NULL terminated
1653 * strings. Optionally some entries can be reserved in the array to hold extra
1654 * external strings.
1655 */
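/*
 * The returned buffer is laid out as a pointer array immediately followed
 * by the copied string data, with each '\n' rewritten to '\0'; e.g.
 * "tx\nrx\n" yields cntr_names[0] = "tx" and cntr_names[1] = "rx", leaving
 * num_extra_names trailing slots for the caller to fill.
 */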
1656static int init_cntr_names(const char *names_in,
1657                           const size_t names_len,
1658                           int num_extra_names,
1659                           int *num_cntrs,
1660                           const char ***cntr_names)
1661{
1662        char *names_out, *p, **q;
1663        int i, n;
1664
1665        n = 0;
1666        for (i = 0; i < names_len; i++)
1667                if (names_in[i] == '\n')
1668                        n++;
1669
1670        names_out = kmalloc((n + num_extra_names) * sizeof(char *) + names_len,
1671                            GFP_KERNEL);
1672        if (!names_out) {
1673                *num_cntrs = 0;
1674                *cntr_names = NULL;
1675                return -ENOMEM;
1676        }
1677
1678        p = names_out + (n + num_extra_names) * sizeof(char *);
1679        memcpy(p, names_in, names_len);
1680
1681        q = (char **)names_out;
1682        for (i = 0; i < n; i++) {
1683                q[i] = p;
1684                p = strchr(p, '\n');
1685                *p++ = '\0';
1686        }
1687
1688        *num_cntrs = n;
1689        *cntr_names = (const char **)names_out;
1690        return 0;
1691}
1692
1693static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev,
1694                                            u8 port_num)
1695{
1696        int i, err;
1697
1698        mutex_lock(&cntr_names_lock);
1699        if (!cntr_names_initialized) {
1700                struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1701
1702                err = init_cntr_names(dd->cntrnames,
1703                                      dd->cntrnameslen,
1704                                      num_driver_cntrs,
1705                                      &num_dev_cntrs,
1706                                      &dev_cntr_names);
1707                if (err) {
1708                        mutex_unlock(&cntr_names_lock);
1709                        return NULL;
1710                }
1711
1712                for (i = 0; i < num_driver_cntrs; i++)
1713                        dev_cntr_names[num_dev_cntrs + i] =
1714                                driver_cntr_names[i];
1715
1716                err = init_cntr_names(dd->portcntrnames,
1717                                      dd->portcntrnameslen,
1718                                      0,
1719                                      &num_port_cntrs,
1720                                      &port_cntr_names);
1721                if (err) {
1722                        kfree(dev_cntr_names);
1723                        dev_cntr_names = NULL;
1724                        mutex_unlock(&cntr_names_lock);
1725                        return NULL;
1726                }
1727                cntr_names_initialized = 1;
1728        }
1729        mutex_unlock(&cntr_names_lock);
1730
1731        if (!port_num)
1732                return rdma_alloc_hw_stats_struct(
1733                                dev_cntr_names,
1734                                num_dev_cntrs + num_driver_cntrs,
1735                                RDMA_HW_STATS_DEFAULT_LIFESPAN);
1736        else
1737                return rdma_alloc_hw_stats_struct(
1738                                port_cntr_names,
1739                                num_port_cntrs,
1740                                RDMA_HW_STATS_DEFAULT_LIFESPAN);
1741}
1742
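/* Sum the per-CPU interrupt counters across every hfi1 device. */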
1743static u64 hfi1_sps_ints(void)
1744{
1745        unsigned long index, flags;
1746        struct hfi1_devdata *dd;
1747        u64 sps_ints = 0;
1748
1749        xa_lock_irqsave(&hfi1_dev_table, flags);
1750        xa_for_each(&hfi1_dev_table, index, dd) {
1751                sps_ints += get_all_cpu_total(dd->int_counter);
1752        }
1753        xa_unlock_irqrestore(&hfi1_dev_table, flags);
1754        return sps_ints;
1755}
1756
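/*
 * Port 0 requests the device-wide counters plus the driver counters
 * (with the interrupt total patched in); a non-zero port returns that
 * port's counters.
 */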
1757static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
1758                        u8 port, int index)
1759{
1760        u64 *values;
1761        int count;
1762
1763        if (!port) {
1764                u64 *stats = (u64 *)&hfi1_stats;
1765                int i;
1766
1767                hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values);
1768                values[num_dev_cntrs] = hfi1_sps_ints();
1769                for (i = 1; i < num_driver_cntrs; i++)
1770                        values[num_dev_cntrs + i] = stats[i];
1771                count = num_dev_cntrs + num_driver_cntrs;
1772        } else {
1773                struct hfi1_ibport *ibp = to_iport(ibdev, port);
1774
1775                hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values);
1776                count = num_port_cntrs;
1777        }
1778
1779        memcpy(stats->value, values, count * sizeof(u64));
1780        return count;
1781}
1782
1783static const struct ib_device_ops hfi1_dev_ops = {
1784        .owner = THIS_MODULE,
1785        .driver_id = RDMA_DRIVER_HFI1,
1786
1787        .alloc_hw_stats = alloc_hw_stats,
1788        .alloc_rdma_netdev = hfi1_vnic_alloc_rn,
1789        .get_dev_fw_str = hfi1_get_dev_fw_str,
1790        .get_hw_stats = get_hw_stats,
1791        .init_port = hfi1_create_port_files,
1792        .modify_device = modify_device,
1793        /* keep process mad in the driver */
1794        .process_mad = hfi1_process_mad,
1795};
1796
1797/**
1798 * hfi1_register_ib_device - register our device with the infiniband core
1799 * @dd: the device data structure
1800 * Return 0 if successful, errno if unsuccessful.
1801 */
1802int hfi1_register_ib_device(struct hfi1_devdata *dd)
1803{
1804        struct hfi1_ibdev *dev = &dd->verbs_dev;
1805        struct ib_device *ibdev = &dev->rdi.ibdev;
1806        struct hfi1_pportdata *ppd = dd->pport;
1807        struct hfi1_ibport *ibp = &ppd->ibport_data;
1808        unsigned i;
1809        int ret;
1810
1811        for (i = 0; i < dd->num_pports; i++)
1812                init_ibport(ppd + i);
1813
1814        /* Only need to initialize non-zero fields. */
1815
1816        timer_setup(&dev->mem_timer, mem_timer, 0);
1817
1818        seqlock_init(&dev->iowait_lock);
1819        seqlock_init(&dev->txwait_lock);
1820        INIT_LIST_HEAD(&dev->txwait);
1821        INIT_LIST_HEAD(&dev->memwait);
1822
1823        ret = verbs_txreq_init(dev);
1824        if (ret)
1825                goto err_verbs_txreq;
1826
1827        /* Use first-port GUID as node guid */
1828        ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX);
1829
1830        /*
1831         * The system image GUID is supposed to be the same for all
1832         * HFIs in a single system but since there can be other
1833         * device types in the system, we can't be sure this is unique.
1834         */
1835        if (!ib_hfi1_sys_image_guid)
1836                ib_hfi1_sys_image_guid = ibdev->node_guid;
1837        ibdev->phys_port_cnt = dd->num_pports;
1838        ibdev->dev.parent = &dd->pcidev->dev;
1839
1840        ib_set_device_ops(ibdev, &hfi1_dev_ops);
1841
1842        strlcpy(ibdev->node_desc, init_utsname()->nodename,
1843                sizeof(ibdev->node_desc));
1844
1845        /*
1846         * Fill in rvt info object.
1847         */
1848        dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
1849        dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
1850        dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
1851        dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
1852        dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
1853        dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
1854        dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
1855        /*
1856         * Fill in rvt info device attributes.
1857         */
1858        hfi1_fill_device_attr(dd);
1859
1860        /* queue pair */
1861        dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
1862        dd->verbs_dev.rdi.dparms.qpn_start = 0;
1863        dd->verbs_dev.rdi.dparms.qpn_inc = 1;
1864        dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
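        /*
         * Reserve the 64K QPN block starting at kdeth_qp << 16 so rdmavt
         * never hands these out; this range is set aside for KDETH packets.
         */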
1865        dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
1866        dd->verbs_dev.rdi.dparms.qpn_res_end =
1867        dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
1868        dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
1869        dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
1870        dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
1871        dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
1872        dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA |
1873                                                RDMA_CORE_CAP_OPA_AH;
1874        dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
1875
1876        dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
1877        dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
1878        dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
1879        dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
1880        dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
1881        dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt;
1882        dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
1883        dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
1884        dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
1885        dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
1886        dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
1887        dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
1888        dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
1890        dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
1891        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
1892        dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
1893        dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
1894        dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
1895        dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
1896        dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
1897                                                hfi1_comp_vect_mappings_lookup;
1898
1899        /* completion queue */
1900        dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus;
1901        dd->verbs_dev.rdi.dparms.node = dd->node;
1902
1903        /* misc settings */
1904        dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
1905        dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
1906        dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
1907        dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
1908        dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
1909        dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
1910        dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
1911        dd->verbs_dev.rdi.dparms.reserved_operations = 1;
1912        dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
1913
1914        /* post send table */
1915        dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
1916
1917        /* opcode translation table */
1918        dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
1919
1920        ppd = dd->pport;
1921        for (i = 0; i < dd->num_pports; i++, ppd++)
1922                rvt_init_port(&dd->verbs_dev.rdi,
1923                              &ppd->ibport_data.rvp,
1924                              i,
1925                              ppd->pkeys);
1926
1927        rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
1928                                    &ib_hfi1_attr_group);
1929
1930        ret = rvt_register_device(&dd->verbs_dev.rdi);
1931        if (ret)
1932                goto err_verbs_txreq;
1933
1934        ret = hfi1_verbs_register_sysfs(dd);
1935        if (ret)
1936                goto err_class;
1937
1938        return ret;
1939
1940err_class:
1941        rvt_unregister_device(&dd->verbs_dev.rdi);
1942err_verbs_txreq:
1943        verbs_txreq_exit(dev);
1944        dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1945        return ret;
1946}
1947
1948void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
1949{
1950        struct hfi1_ibdev *dev = &dd->verbs_dev;
1951
1952        hfi1_verbs_unregister_sysfs(dd);
1953
1954        rvt_unregister_device(&dd->verbs_dev.rdi);
1955
1956        if (!list_empty(&dev->txwait))
1957                dd_dev_err(dd, "txwait list not empty!\n");
1958        if (!list_empty(&dev->memwait))
1959                dd_dev_err(dd, "memwait list not empty!\n");
1960
1961        del_timer_sync(&dev->mem_timer);
1962        verbs_txreq_exit(dev);
1963
1964        mutex_lock(&cntr_names_lock);
1965        kfree(dev_cntr_names);
1966        kfree(port_cntr_names);
1967        dev_cntr_names = NULL;
1968        port_cntr_names = NULL;
1969        cntr_names_initialized = 0;
1970        mutex_unlock(&cntr_names_lock);
1971}
1972
1973void hfi1_cnp_rcv(struct hfi1_packet *packet)
1974{
1975        struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1976        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1977        struct ib_header *hdr = packet->hdr;
1978        struct rvt_qp *qp = packet->qp;
1979        u32 lqpn, rqpn = 0;
1980        u16 rlid = 0;
1981        u8 sl, sc5, svc_type;
1982
1983        switch (packet->qp->ibqp.qp_type) {
1984        case IB_QPT_UC:
1985                rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1986                rqpn = qp->remote_qpn;
1987                svc_type = IB_CC_SVCTYPE_UC;
1988                break;
1989        case IB_QPT_RC:
1990                rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1991                rqpn = qp->remote_qpn;
1992                svc_type = IB_CC_SVCTYPE_RC;
1993                break;
1994        case IB_QPT_SMI:
1995        case IB_QPT_GSI:
1996        case IB_QPT_UD:
1997                svc_type = IB_CC_SVCTYPE_UD;
1998                break;
1999        default:
2000                ibp->rvp.n_pkt_drops++;
2001                return;
2002        }
2003
2004        sc5 = hfi1_9B_get_sc5(hdr, packet->rhf);
2005        sl = ibp->sc_to_sl[sc5];
2006        lqpn = qp->ibqp.qp_num;
2007
2008        process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
2009}
2010