linux/drivers/infiniband/hw/hfi1/verbs.c
/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <rdma/ib_mad.h>
#include <rdma/ib_user_verbs.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/utsname.h>
#include <linux/rculist.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <rdma/opa_addr.h>
#include <linux/nospec.h>

#include "hfi.h"
#include "common.h"
#include "device.h"
#include "trace.h"
#include "qp.h"
#include "verbs_txreq.h"
#include "debugfs.h"
#include "vnic.h"
#include "fault.h"
#include "affinity.h"

static unsigned int hfi1_lkey_table_size = 16;
module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
                   S_IRUGO);
MODULE_PARM_DESC(lkey_table_size,
                 "LKEY table size in bits (2^n, 1 <= n <= 23)");

static unsigned int hfi1_max_pds = 0xFFFF;
module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
MODULE_PARM_DESC(max_pds,
                 "Maximum number of protection domains to support");

static unsigned int hfi1_max_ahs = 0xFFFF;
module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");

unsigned int hfi1_max_cqes = 0x2FFFFF;
module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
MODULE_PARM_DESC(max_cqes,
                 "Maximum number of completion queue entries to support");

unsigned int hfi1_max_cqs = 0x1FFFF;
module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");

unsigned int hfi1_max_qp_wrs = 0x3FFF;
module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");

unsigned int hfi1_max_qps = 32768;
module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");

unsigned int hfi1_max_sges = 0x60;
module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");

unsigned int hfi1_max_mcast_grps = 16384;
module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
MODULE_PARM_DESC(max_mcast_grps,
                 "Maximum number of multicast groups to support");

unsigned int hfi1_max_mcast_qp_attached = 16;
module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
                   uint, S_IRUGO);
MODULE_PARM_DESC(max_mcast_qp_attached,
                 "Maximum number of attached QPs to support");

unsigned int hfi1_max_srqs = 1024;
module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");

unsigned int hfi1_max_srq_sges = 128;
module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");

unsigned int hfi1_max_srq_wrs = 0x1FFFF;
module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");

unsigned short piothreshold = 256;
module_param(piothreshold, ushort, S_IRUGO);
MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");

static unsigned int sge_copy_mode;
module_param(sge_copy_mode, uint, S_IRUGO);
MODULE_PARM_DESC(sge_copy_mode,
                 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");

static void verbs_sdma_complete(
        struct sdma_txreq *cookie,
        int status);

static int pio_wait(struct rvt_qp *qp,
                    struct send_context *sc,
                    struct hfi1_pkt_state *ps,
                    u32 flag);

/* Length of buffer to create verbs txreq cache name */
#define TXREQ_NAME_LEN 24

static uint wss_threshold = 80;
module_param(wss_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
static uint wss_clean_period = 256;
module_param(wss_clean_period, uint, S_IRUGO);
MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
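
/*
 * Illustrative usage (a sketch, not part of the driver): the knobs above
 * are declared with S_IRUGO only, so they are read-only at runtime and
 * must be set at module load time, e.g.
 *
 *   modprobe hfi1 piothreshold=512 sge_copy_mode=2 wss_threshold=70
 *
 * The current values are visible under /sys/module/hfi1/parameters/.
 */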

/*
 * Translate ib_wr_opcode into ib_wc_opcode.
 */
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
        [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
        [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
        [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
        [IB_WR_SEND] = IB_WC_SEND,
        [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
        [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
        [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
        [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
        [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
        [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
        [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
        [IB_WR_REG_MR] = IB_WC_REG_MR
};

/*
 * Length of header by opcode, 0 --> not supported
 */
const u8 hdr_len_by_opcode[256] = {
        /* RC */
        [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
        [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
        [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
        [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
        [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
        [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
        [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
        [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
        [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
        [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
        [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
        [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
        [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
        [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4 + 8,
        [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
        [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
        [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
        [IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
        [IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
        /* UC */
        [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
        [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
        [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
        [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
        [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
        [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
        [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
        [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
        [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
        [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
        [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
        [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
        /* UD */
        [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
        [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
};
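
/*
 * Worked example for the table above: each entry is BTH (12) + LRH (8)
 * plus any extension headers, in bytes.  RDMA_WRITE_ONLY_WITH_IMMEDIATE
 * is 12 + 8 + 20 for a RETH (16) followed by an immediate (4), and
 * COMPARE_SWAP is 12 + 8 + 28 for an AtomicETH.
 */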

static const opcode_handler opcode_handler_tbl[256] = {
        /* RC */
        [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
        [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
        [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
        [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
        [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,

        /* TID RDMA has separate handlers for different opcodes. */
        [IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
        [IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
        [IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
        [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
        [IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
        [IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
        [IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
        [IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,

        /* UC */
        [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
        [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
        [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
        [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
        [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
        [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
        [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
        [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
        [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
        [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
        [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
        [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
        /* UD */
        [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
        [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
        /* CNP */
        [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
};

#define OPMASK 0x1f

static const u32 pio_opmask[BIT(3)] = {
        /* RC */
        [IB_OPCODE_RC >> 5] =
                BIT(RC_OP(SEND_ONLY) & OPMASK) |
                BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
                BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) |
                BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) |
                BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) |
                BIT(RC_OP(ACKNOWLEDGE) & OPMASK) |
                BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) |
                BIT(RC_OP(COMPARE_SWAP) & OPMASK) |
                BIT(RC_OP(FETCH_ADD) & OPMASK),
        /* UC */
        [IB_OPCODE_UC >> 5] =
                BIT(UC_OP(SEND_ONLY) & OPMASK) |
                BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
                BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) |
                BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK),
};
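
/*
 * Decoding sketch for the table above: an opcode's top three bits select
 * the transport (IB_OPCODE_RC >> 5 == 0, IB_OPCODE_UC >> 5 == 1) and its
 * low five bits (OPMASK) index a bit within that u32.  RC_OP(SEND_ONLY)
 * is 0x04, for example, so bit 4 of pio_opmask[0] marks RC SEND_ONLY as
 * a candidate for PIO in get_send_routine() below.
 */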

/*
 * System image GUID.
 */
__be64 ib_hfi1_sys_image_guid;

/*
 * Make sure the QP is ready and able to accept the given opcode.
 */
static inline opcode_handler qp_ok(struct hfi1_packet *packet)
{
        if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
                return NULL;
        if (((packet->opcode & RVT_OPCODE_QP_MASK) ==
             packet->qp->allowed_ops) ||
            (packet->opcode == IB_OPCODE_CNP))
                return opcode_handler_tbl[packet->opcode];

        return NULL;
}

static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
{
#ifdef CONFIG_FAULT_INJECTION
        if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
                /*
                 * In order to drop non-IB traffic we
                 * set PbcInsertHrc to NONE (0x2).
                 * The packet will still be delivered
                 * to the receiving node but a
                 * KHdrHCRCErr (KDETH packet with a bad
                 * HCRC) will be triggered and the
                 * packet will not be delivered to the
                 * correct context.
                 */
                pbc &= ~PBC_INSERT_HCRC_SMASK;
                pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
        } else {
                /*
                 * In order to drop regular verbs
                 * traffic we set the PbcTestEbp
                 * flag. The packet will still be
                 * delivered to the receiving node but
                 * a 'late ebp error' will be
                 * triggered and will be dropped.
                 */
                pbc |= PBC_TEST_EBP;
        }
#endif
        return pbc;
}

static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
{
        if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
            !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
                return NULL;
        if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
                return opcode_handler_tbl[opcode];
        return NULL;
}

void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
{
        struct hfi1_ctxtdata *rcd = packet->rcd;
        struct ib_header *hdr = packet->hdr;
        u32 tlen = packet->tlen;
        struct hfi1_pportdata *ppd = rcd->ppd;
        struct hfi1_ibport *ibp = &ppd->ibport_data;
        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
        opcode_handler opcode_handler;
        unsigned long flags;
        u32 qp_num;
        int lnh;
        u8 opcode;

        /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
        if (unlikely(tlen < 15 * sizeof(u32)))
                goto drop;

        lnh = be16_to_cpu(hdr->lrh[0]) & 3;
        if (lnh != HFI1_LRH_BTH)
                goto drop;

        packet->ohdr = &hdr->u.oth;
        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));

        opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
        inc_opstats(tlen, &rcd->opstats->stats[opcode]);

        /* verbs_qp can be picked up from any tid_rdma header struct */
        qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
                RVT_QPN_MASK;

        rcu_read_lock();
        packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
        if (!packet->qp)
                goto drop_rcu;
        spin_lock_irqsave(&packet->qp->r_lock, flags);
        opcode_handler = tid_qp_ok(opcode, packet);
        if (likely(opcode_handler))
                opcode_handler(packet);
        else
                goto drop_unlock;
        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
        rcu_read_unlock();

        return;
drop_unlock:
        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
drop_rcu:
        rcu_read_unlock();
drop:
        ibp->rvp.n_pkt_drops++;
}

void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
{
        struct hfi1_ctxtdata *rcd = packet->rcd;
        struct ib_header *hdr = packet->hdr;
        u32 tlen = packet->tlen;
        struct hfi1_pportdata *ppd = rcd->ppd;
        struct hfi1_ibport *ibp = &ppd->ibport_data;
        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
        opcode_handler opcode_handler;
        unsigned long flags;
        u32 qp_num;
        int lnh;
        u8 opcode;

        /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
        if (unlikely(tlen < 15 * sizeof(u32)))
                goto drop;

        lnh = be16_to_cpu(hdr->lrh[0]) & 3;
        if (lnh != HFI1_LRH_BTH)
                goto drop;

        packet->ohdr = &hdr->u.oth;
        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));

        opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
        inc_opstats(tlen, &rcd->opstats->stats[opcode]);

        /* verbs_qp can be picked up from any tid_rdma header struct */
        qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
                RVT_QPN_MASK;

        rcu_read_lock();
        packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
        if (!packet->qp)
                goto drop_rcu;
        spin_lock_irqsave(&packet->qp->r_lock, flags);
        opcode_handler = tid_qp_ok(opcode, packet);
        if (likely(opcode_handler))
                opcode_handler(packet);
        else
                goto drop_unlock;
        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
        rcu_read_unlock();

        return;
drop_unlock:
        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
drop_rcu:
        rcu_read_unlock();
drop:
        ibp->rvp.n_pkt_drops++;
}

static int hfi1_do_pkey_check(struct hfi1_packet *packet)
{
        struct hfi1_ctxtdata *rcd = packet->rcd;
        struct hfi1_pportdata *ppd = rcd->ppd;
        struct hfi1_16b_header *hdr = packet->hdr;
        u16 pkey;

        /* Pkey check needed only for bypass packets */
        if (packet->etype != RHF_RCV_TYPE_BYPASS)
                return 0;

        /* Perform pkey check */
        pkey = hfi1_16B_get_pkey(hdr);
        return ingress_pkey_check(ppd, pkey, packet->sc,
                                  packet->qp->s_pkey_index,
                                  packet->slid, true);
}

static inline void hfi1_handle_packet(struct hfi1_packet *packet,
                                      bool is_mcast)
{
        u32 qp_num;
        struct hfi1_ctxtdata *rcd = packet->rcd;
        struct hfi1_pportdata *ppd = rcd->ppd;
        struct hfi1_ibport *ibp = rcd_to_iport(rcd);
        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
        opcode_handler packet_handler;
        unsigned long flags;

        inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]);

        if (unlikely(is_mcast)) {
                struct rvt_mcast *mcast;
                struct rvt_mcast_qp *p;

                if (!packet->grh)
                        goto drop;
                mcast = rvt_mcast_find(&ibp->rvp,
                                       &packet->grh->dgid,
                                       opa_get_lid(packet->dlid, 9B));
                if (!mcast)
                        goto drop;
                list_for_each_entry_rcu(p, &mcast->qp_list, list) {
                        packet->qp = p->qp;
                        if (hfi1_do_pkey_check(packet))
                                goto drop;
                        spin_lock_irqsave(&packet->qp->r_lock, flags);
                        packet_handler = qp_ok(packet);
                        if (likely(packet_handler))
                                packet_handler(packet);
                        else
                                ibp->rvp.n_pkt_drops++;
                        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
                }
                /*
                 * Notify rvt_multicast_detach() if it is waiting for us
                 * to finish.
                 */
                if (atomic_dec_return(&mcast->refcount) <= 1)
                        wake_up(&mcast->wait);
        } else {
                /* Get the destination QP number. */
                if (packet->etype == RHF_RCV_TYPE_BYPASS &&
                    hfi1_16B_get_l4(packet->hdr) == OPA_16B_L4_FM)
                        qp_num = hfi1_16B_get_dest_qpn(packet->mgmt);
                else
                        qp_num = ib_bth_get_qpn(packet->ohdr);

                rcu_read_lock();
                packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
                if (!packet->qp)
                        goto unlock_drop;

                if (hfi1_do_pkey_check(packet))
                        goto unlock_drop;

                spin_lock_irqsave(&packet->qp->r_lock, flags);
                packet_handler = qp_ok(packet);
                if (likely(packet_handler))
                        packet_handler(packet);
                else
                        ibp->rvp.n_pkt_drops++;
                spin_unlock_irqrestore(&packet->qp->r_lock, flags);
                rcu_read_unlock();
        }
        return;
unlock_drop:
        rcu_read_unlock();
drop:
        ibp->rvp.n_pkt_drops++;
}

/**
 * hfi1_ib_rcv - process an incoming packet
 * @packet: data packet information
 *
 * This is called to process an incoming packet at interrupt level.
 */
void hfi1_ib_rcv(struct hfi1_packet *packet)
{
        struct hfi1_ctxtdata *rcd = packet->rcd;

        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
        hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
}

void hfi1_16B_rcv(struct hfi1_packet *packet)
{
        struct hfi1_ctxtdata *rcd = packet->rcd;

        trace_input_ibhdr(rcd->dd, packet, false);
        hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
}

/*
 * This is called from a timer to check for QPs
 * which need kernel memory in order to send a packet.
 */
static void mem_timer(struct timer_list *t)
{
        struct hfi1_ibdev *dev = from_timer(dev, t, mem_timer);
        struct list_head *list = &dev->memwait;
        struct rvt_qp *qp = NULL;
        struct iowait *wait;
        unsigned long flags;
        struct hfi1_qp_priv *priv;

        write_seqlock_irqsave(&dev->iowait_lock, flags);
        if (!list_empty(list)) {
                wait = list_first_entry(list, struct iowait, list);
                qp = iowait_to_qp(wait);
                priv = qp->priv;
                list_del_init(&priv->s_iowait.list);
                priv->s_iowait.lock = NULL;
                /* refcount held until actual wake up */
                if (!list_empty(list))
                        mod_timer(&dev->mem_timer, jiffies + 1);
        }
        write_sequnlock_irqrestore(&dev->iowait_lock, flags);

        if (qp)
                hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
}
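
/*
 * Behavioral note on mem_timer() above: at most one waiter is woken per
 * tick, and the timer is re-armed one jiffy out while more QPs remain on
 * dev->memwait, so memory-starved QPs are released one at a time rather
 * than all at once.
 */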

/*
 * This is called with progress side lock held.
 */
/* New API */
static void verbs_sdma_complete(
        struct sdma_txreq *cookie,
        int status)
{
        struct verbs_txreq *tx =
                container_of(cookie, struct verbs_txreq, txreq);
        struct rvt_qp *qp = tx->qp;

        spin_lock(&qp->s_lock);
        if (tx->wqe) {
                rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
                struct hfi1_opa_header *hdr;

                hdr = &tx->phdr.hdr;
                if (unlikely(status == SDMA_TXREQ_S_ABORTED))
                        hfi1_rc_verbs_aborted(qp, hdr);
                hfi1_rc_send_complete(qp, hdr);
        }
        spin_unlock(&qp->s_lock);

        hfi1_put_txreq(tx);
}

void hfi1_wait_kmem(struct rvt_qp *qp)
{
        struct hfi1_qp_priv *priv = qp->priv;
        struct ib_qp *ibqp = &qp->ibqp;
        struct ib_device *ibdev = ibqp->device;
        struct hfi1_ibdev *dev = to_idev(ibdev);

        if (list_empty(&priv->s_iowait.list)) {
                if (list_empty(&dev->memwait))
                        mod_timer(&dev->mem_timer, jiffies + 1);
                qp->s_flags |= RVT_S_WAIT_KMEM;
                list_add_tail(&priv->s_iowait.list, &dev->memwait);
                priv->s_iowait.lock = &dev->iowait_lock;
                trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
                rvt_get_qp(qp);
        }
}

static int wait_kmem(struct hfi1_ibdev *dev,
                     struct rvt_qp *qp,
                     struct hfi1_pkt_state *ps)
{
        unsigned long flags;
        int ret = 0;

        spin_lock_irqsave(&qp->s_lock, flags);
        if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
                write_seqlock(&dev->iowait_lock);
                list_add_tail(&ps->s_txreq->txreq.list,
                              &ps->wait->tx_head);
                hfi1_wait_kmem(qp);
                write_sequnlock(&dev->iowait_lock);
                hfi1_qp_unbusy(qp, ps->wait);
                ret = -EBUSY;
        }
        spin_unlock_irqrestore(&qp->s_lock, flags);

        return ret;
}

/*
 * This routine calls txadds for each sg entry.
 *
 * Add failures will revert the sge cursor
 */
static noinline int build_verbs_ulp_payload(
        struct sdma_engine *sde,
        u32 length,
        struct verbs_txreq *tx)
{
        struct rvt_sge_state *ss = tx->ss;
        struct rvt_sge *sg_list = ss->sg_list;
        struct rvt_sge sge = ss->sge;
        u8 num_sge = ss->num_sge;
        u32 len;
        int ret = 0;

        while (length) {
                len = rvt_get_sge_length(&ss->sge, length);
                WARN_ON_ONCE(len == 0);
                ret = sdma_txadd_kvaddr(
                        sde->dd,
                        &tx->txreq,
                        ss->sge.vaddr,
                        len);
                if (ret)
                        goto bail_txadd;
                rvt_update_sge(ss, len, false);
                length -= len;
        }
        return ret;
bail_txadd:
        /* unwind cursor */
        ss->sge = sge;
        ss->num_sge = num_sge;
        ss->sg_list = sg_list;
        return ret;
}

/**
 * update_tx_opstats - record stats by opcode
 * @qp: the qp
 * @ps: transmit packet state
 * @plen: the plen in dwords
 *
 * This is a routine to record the tx opstats after a
 * packet has been presented to the egress mechanism.
 */
static void update_tx_opstats(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                              u32 plen)
{
#ifdef CONFIG_DEBUG_FS
        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
        struct hfi1_opcode_stats_perctx *s = get_cpu_ptr(dd->tx_opstats);

        inc_opstats(plen * 4, &s->stats[ps->opcode]);
        put_cpu_ptr(s);
#endif
}

/*
 * Build the number of DMA descriptors needed to send length bytes of data.
 *
 * NOTE: DMA mapping is held in the tx until completed in the ring or
 *       the tx desc is freed without having been submitted to the ring
 *
 * This routine ensures all the helper routine calls succeed.
 */
/* New API */
static int build_verbs_tx_desc(
        struct sdma_engine *sde,
        u32 length,
        struct verbs_txreq *tx,
        struct hfi1_ahg_info *ahg_info,
        u64 pbc)
{
        int ret = 0;
        struct hfi1_sdma_header *phdr = &tx->phdr;
        u16 hdrbytes = (tx->hdr_dwords + sizeof(pbc) / 4) << 2;
        u8 extra_bytes = 0;

        if (tx->phdr.hdr.hdr_type) {
                /*
                 * hdrbytes accounts for PBC. Need to subtract 8 bytes
                 * before calculating padding.
                 */
                extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) +
                              (SIZE_OF_CRC << 2) + SIZE_OF_LT;
        }
        if (!ahg_info->ahgcount) {
                ret = sdma_txinit_ahg(
                        &tx->txreq,
                        ahg_info->tx_flags,
                        hdrbytes + length +
                        extra_bytes,
                        ahg_info->ahgidx,
                        0,
                        NULL,
                        0,
                        verbs_sdma_complete);
                if (ret)
                        goto bail_txadd;
                phdr->pbc = cpu_to_le64(pbc);
                ret = sdma_txadd_kvaddr(
                        sde->dd,
                        &tx->txreq,
                        phdr,
                        hdrbytes);
                if (ret)
                        goto bail_txadd;
        } else {
                ret = sdma_txinit_ahg(
                        &tx->txreq,
                        ahg_info->tx_flags,
                        length,
                        ahg_info->ahgidx,
                        ahg_info->ahgcount,
                        ahg_info->ahgdesc,
                        hdrbytes,
                        verbs_sdma_complete);
                if (ret)
                        goto bail_txadd;
        }
        /* add the ulp payload - if any. tx->ss can be NULL for acks */
        if (tx->ss) {
                ret = build_verbs_ulp_payload(sde, length, tx);
                if (ret)
                        goto bail_txadd;
        }

        /* add icrc, lt byte, and padding to flit */
        if (extra_bytes)
                ret = sdma_txadd_daddr(sde->dd, &tx->txreq,
                                       sde->dd->sdma_pad_phys, extra_bytes);

bail_txadd:
        return ret;
}

static u64 update_hcrc(u8 opcode, u64 pbc)
{
        if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
                pbc &= ~PBC_INSERT_HCRC_SMASK;
                pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
        }
        return pbc;
}
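
/*
 * Contrast with hfi1_fault_tx() above: update_hcrc() has the hardware
 * insert a valid KDETH HCRC (PBC_IHCRC_LKDETH) so TID RDMA packets pass
 * the receiver's HCRC check, whereas the fault-injection path forces
 * PBC_IHCRC_NONE so the packet fails that same check and is dropped.
 */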

int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                        u64 pbc)
{
        struct hfi1_qp_priv *priv = qp->priv;
        struct hfi1_ahg_info *ahg_info = priv->s_ahg;
        u32 hdrwords = ps->s_txreq->hdr_dwords;
        u32 len = ps->s_txreq->s_cur_size;
        u32 plen;
        struct hfi1_ibdev *dev = ps->dev;
        struct hfi1_pportdata *ppd = ps->ppd;
        struct verbs_txreq *tx;
        u8 sc5 = priv->s_sc;
        int ret;
        u32 dwords;

        if (ps->s_txreq->phdr.hdr.hdr_type) {
                u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len);

                dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) +
                          SIZE_OF_LT) >> 2;
        } else {
                dwords = (len + 3) >> 2;
        }
        plen = hdrwords + dwords + sizeof(pbc) / 4;

        tx = ps->s_txreq;
        if (!sdma_txreq_built(&tx->txreq)) {
                if (likely(pbc == 0)) {
                        u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);

                        /* No vl15 here */
                        /* set PBC_DC_INFO bit (aka SC[4]) in pbc */
                        if (ps->s_txreq->phdr.hdr.hdr_type)
                                pbc |= PBC_PACKET_BYPASS |
                                       PBC_INSERT_BYPASS_ICRC;
                        else
                                pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);

                        pbc = create_pbc(ppd,
                                         pbc,
                                         qp->srate_mbps,
                                         vl,
                                         plen);

                        if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
                                pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
                        else
                                /* Update HCRC based on packet opcode */
                                pbc = update_hcrc(ps->opcode, pbc);
                }
                tx->wqe = qp->s_wqe;
                ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
                if (unlikely(ret))
                        goto bail_build;
        }
        ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
        if (unlikely(ret < 0)) {
                if (ret == -ECOMM)
                        goto bail_ecomm;
                return ret;
        }

        update_tx_opstats(qp, ps, plen);
        trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
                                &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
        return ret;

bail_ecomm:
        /* The current one got "sent" */
        return 0;
bail_build:
        ret = wait_kmem(dev, qp, ps);
        if (!ret) {
                /* free txreq - bad state */
                hfi1_put_txreq(ps->s_txreq);
                ps->s_txreq = NULL;
        }
        return ret;
}

/*
 * If we are now in the error state, return zero to flush the
 * send work request.
 */
static int pio_wait(struct rvt_qp *qp,
                    struct send_context *sc,
                    struct hfi1_pkt_state *ps,
                    u32 flag)
{
        struct hfi1_qp_priv *priv = qp->priv;
        struct hfi1_devdata *dd = sc->dd;
        unsigned long flags;
        int ret = 0;

        /*
         * Note that as soon as want_buffer() is called and
         * possibly before it returns, sc_piobufavail()
         * could be called. Therefore, put QP on the I/O wait list before
         * enabling the PIO avail interrupt.
         */
        spin_lock_irqsave(&qp->s_lock, flags);
        if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
                write_seqlock(&sc->waitlock);
                list_add_tail(&ps->s_txreq->txreq.list,
                              &ps->wait->tx_head);
                if (list_empty(&priv->s_iowait.list)) {
                        struct hfi1_ibdev *dev = &dd->verbs_dev;
                        int was_empty;

                        dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
                        dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
                        qp->s_flags |= flag;
                        was_empty = list_empty(&sc->piowait);
                        iowait_get_priority(&priv->s_iowait);
                        iowait_queue(ps->pkts_sent, &priv->s_iowait,
                                     &sc->piowait);
                        priv->s_iowait.lock = &sc->waitlock;
                        trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
                        rvt_get_qp(qp);
                        /* counting: only call wantpiobuf_intr if first user */
                        if (was_empty)
                                hfi1_sc_wantpiobuf_intr(sc, 1);
                }
                write_sequnlock(&sc->waitlock);
                hfi1_qp_unbusy(qp, ps->wait);
                ret = -EBUSY;
        }
        spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
}

static void verbs_pio_complete(void *arg, int code)
{
        struct rvt_qp *qp = (struct rvt_qp *)arg;
        struct hfi1_qp_priv *priv = qp->priv;

        if (iowait_pio_dec(&priv->s_iowait))
                iowait_drain_wakeup(&priv->s_iowait);
}

int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
                        u64 pbc)
{
        struct hfi1_qp_priv *priv = qp->priv;
        u32 hdrwords = ps->s_txreq->hdr_dwords;
        struct rvt_sge_state *ss = ps->s_txreq->ss;
        u32 len = ps->s_txreq->s_cur_size;
        u32 dwords;
        u32 plen;
        struct hfi1_pportdata *ppd = ps->ppd;
        u32 *hdr;
        u8 sc5;
        unsigned long flags = 0;
        struct send_context *sc;
        struct pio_buf *pbuf;
        int wc_status = IB_WC_SUCCESS;
        int ret = 0;
        pio_release_cb cb = NULL;
        u8 extra_bytes = 0;

        if (ps->s_txreq->phdr.hdr.hdr_type) {
                u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len);

                extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT;
                dwords = (len + extra_bytes) >> 2;
                hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah;
        } else {
                dwords = (len + 3) >> 2;
                hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh;
        }
        plen = hdrwords + dwords + sizeof(pbc) / 4;

        /* only RC/UC use complete */
        switch (qp->ibqp.qp_type) {
        case IB_QPT_RC:
        case IB_QPT_UC:
                cb = verbs_pio_complete;
                break;
        default:
                break;
        }

        /* vl15 special case taken care of in ud.c */
        sc5 = priv->s_sc;
        sc = ps->s_txreq->psc;

        if (likely(pbc == 0)) {
                u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);

                /* set PBC_DC_INFO bit (aka SC[4]) in pbc */
                if (ps->s_txreq->phdr.hdr.hdr_type)
                        pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
                else
                        pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);

                pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
                if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
                        pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
                else
                        /* Update HCRC based on packet opcode */
                        pbc = update_hcrc(ps->opcode, pbc);
        }
        if (cb)
                iowait_pio_inc(&priv->s_iowait);
        pbuf = sc_buffer_alloc(sc, plen, cb, qp);
        if (IS_ERR_OR_NULL(pbuf)) {
                if (cb)
                        verbs_pio_complete(qp, 0);
                if (IS_ERR(pbuf)) {
                        /*
                         * If we have filled the PIO buffers to capacity and are
                         * not in an active state, this request is not going to
                         * go out, so complete it with an error; otherwise a
                         * ULP or the core may be stuck waiting.
                         */
                        hfi1_cdbg(
                                PIO,
                                "alloc failed. state not active, completing");
                        wc_status = IB_WC_GENERAL_ERR;
                        goto pio_bail;
                } else {
                        /*
                         * This is a normal occurrence. The PIO buffers are
                         * full, but we are still happily sending, so let's
                         * continue to queue the request.
                         */
                        hfi1_cdbg(PIO, "alloc failed. state active, queuing");
                        ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
                        if (!ret)
                                /* txreq not queued - free */
                                goto bail;
                        /* tx consumed in wait */
                        return ret;
                }
        }

        if (dwords == 0) {
                pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
        } else {
                seg_pio_copy_start(pbuf, pbc,
                                   hdr, hdrwords * 4);
                if (ss) {
                        while (len) {
                                void *addr = ss->sge.vaddr;
                                u32 slen = rvt_get_sge_length(&ss->sge, len);

                                rvt_update_sge(ss, slen, false);
                                seg_pio_copy_mid(pbuf, addr, slen);
                                len -= slen;
                        }
                }
                /* add icrc, lt byte, and padding to flit */
                if (extra_bytes)
                        seg_pio_copy_mid(pbuf, ppd->dd->sdma_pad_dma,
                                         extra_bytes);

                seg_pio_copy_end(pbuf);
        }

        update_tx_opstats(qp, ps, plen);
        trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
                               &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));

pio_bail:
        spin_lock_irqsave(&qp->s_lock, flags);
        if (qp->s_wqe) {
                rvt_send_complete(qp, qp->s_wqe, wc_status);
        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
                if (unlikely(wc_status == IB_WC_GENERAL_ERR))
                        hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
                hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
        }
        spin_unlock_irqrestore(&qp->s_lock, flags);

        ret = 0;

bail:
        hfi1_put_txreq(ps->s_txreq);
        return ret;
}

/*
 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
 * being an entry from the partition key table), return 0
 * otherwise. Use the matching criteria for egress partition keys
 * specified in the OPAv1 spec., section 9.11.7.
 */
static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
{
        u16 mkey = pkey & PKEY_LOW_15_MASK;
        u16 mentry = ent & PKEY_LOW_15_MASK;

        if (mkey == mentry) {
                /*
                 * If pkey[15] is set (full partition member),
                 * is bit 15 in the corresponding table element
                 * clear (limited member)?
                 */
                if (pkey & PKEY_MEMBER_MASK)
                        return !!(ent & PKEY_MEMBER_MASK);
                return 1;
        }
        return 0;
}
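
/*
 * Worked example for the check above: against table entry 0x8001 (full
 * member of partition 1), header pkey 0x8001 and limited-member pkey
 * 0x0001 both match.  Against limited-member entry 0x0001, pkey 0x0001
 * still matches but full-member pkey 0x8001 does not: a full-member pkey
 * may only be sent if the table holds full membership.
 */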

/**
 * egress_pkey_check - check P_KEY of a packet
 * @ppd:  Physical IB port data
 * @slid: SLID for packet
 * @pkey: PKEY for header
 * @sc5:  SC for packet
 * @s_pkey_index: Used as a lookup optimization for kernel contexts only.
 * A negative value means a user context is calling this function.
 *
 * It checks if hdr's pkey is valid.
 *
 * Return: 0 on success, 1 otherwise
 */
int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey,
                      u8 sc5, int8_t s_pkey_index)
{
        struct hfi1_devdata *dd;
        int i;
        int is_user_ctxt_mechanism = (s_pkey_index < 0);

        if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
                return 0;

        /* If SC15, pkey[0:14] must be 0x7fff */
        if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
                goto bad;

        /* Is the pkey = 0x0, or 0x8000? */
        if ((pkey & PKEY_LOW_15_MASK) == 0)
                goto bad;

        /*
         * For the kernel contexts only, if a qp is passed into the function,
         * the most likely matching pkey has index qp->s_pkey_index
         */
        if (!is_user_ctxt_mechanism &&
            egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
                return 0;
        }

        for (i = 0; i < MAX_PKEY_VALUES; i++) {
                if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
                        return 0;
        }
bad:
        /*
         * For the user-context mechanism, the P_KEY check would only happen
         * once per SDMA request, not once per packet.  Therefore, there's no
         * need to increment the counter for the user-context mechanism.
         */
        if (!is_user_ctxt_mechanism) {
                incr_cntr64(&ppd->port_xmit_constraint_errors);
                dd = ppd->dd;
                if (!(dd->err_info_xmit_constraint.status &
                      OPA_EI_STATUS_SMASK)) {
                        dd->err_info_xmit_constraint.status |=
                                OPA_EI_STATUS_SMASK;
                        dd->err_info_xmit_constraint.slid = slid;
                        dd->err_info_xmit_constraint.pkey = pkey;
                }
        }
        return 1;
}

/**
 * get_send_routine - choose an egress routine
 *
 * Choose an egress routine based on QP type
 * and size
 */
static inline send_routine get_send_routine(struct rvt_qp *qp,
                                            struct hfi1_pkt_state *ps)
{
        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
        struct hfi1_qp_priv *priv = qp->priv;
        struct verbs_txreq *tx = ps->s_txreq;

        if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
                return dd->process_pio_send;
        switch (qp->ibqp.qp_type) {
        case IB_QPT_SMI:
                return dd->process_pio_send;
        case IB_QPT_GSI:
        case IB_QPT_UD:
                break;
        case IB_QPT_UC:
        case IB_QPT_RC:
                priv->s_running_pkt_size =
                        (tx->s_cur_size + priv->s_running_pkt_size) / 2;
                if (piothreshold &&
                    priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
                    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
                    iowait_sdma_pending(&priv->s_iowait) == 0 &&
                    !sdma_txreq_built(&tx->txreq))
                        return dd->process_pio_send;
                break;
        default:
                break;
        }
        return dd->process_dma_send;
}
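
/*
 * Numeric sketch of the PIO cutoff above: s_running_pkt_size is a simple
 * moving average, so a previous average of 1024 bytes followed by a
 * 64-byte packet yields (64 + 1024) / 2 = 544.  With the default
 * piothreshold of 256 that still selects SDMA; PIO is used only once the
 * average decays to min(piothreshold, qp->pmtu) or below.
 */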
1243
1244/**
1245 * hfi1_verbs_send - send a packet
1246 * @qp: the QP to send on
1247 * @ps: the state of the packet to send
1248 *
1249 * Return zero if packet is sent or queued OK.
1250 * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
1251 */
1252int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
1253{
1254        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1255        struct hfi1_qp_priv *priv = qp->priv;
1256        struct ib_other_headers *ohdr = NULL;
1257        send_routine sr;
1258        int ret;
1259        u16 pkey;
1260        u32 slid;
1261        u8 l4 = 0;
1262
1263        /* locate the pkey within the headers */
1264        if (ps->s_txreq->phdr.hdr.hdr_type) {
1265                struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah;
1266
1267                l4 = hfi1_16B_get_l4(hdr);
1268                if (l4 == OPA_16B_L4_IB_LOCAL)
1269                        ohdr = &hdr->u.oth;
1270                else if (l4 == OPA_16B_L4_IB_GLOBAL)
1271                        ohdr = &hdr->u.l.oth;
1272
1273                slid = hfi1_16B_get_slid(hdr);
1274                pkey = hfi1_16B_get_pkey(hdr);
1275        } else {
1276                struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh;
1277                u8 lnh = ib_get_lnh(hdr);
1278
1279                if (lnh == HFI1_LRH_GRH)
1280                        ohdr = &hdr->u.l.oth;
1281                else
1282                        ohdr = &hdr->u.oth;
1283                slid = ib_get_slid(hdr);
1284                pkey = ib_bth_get_pkey(ohdr);
1285        }
1286
1287        if (likely(l4 != OPA_16B_L4_FM))
1288                ps->opcode = ib_bth_get_opcode(ohdr);
1289        else
1290                ps->opcode = IB_OPCODE_UD_SEND_ONLY;
1291
1292        sr = get_send_routine(qp, ps);
1293        ret = egress_pkey_check(dd->pport, slid, pkey,
1294                                priv->s_sc, qp->s_pkey_index);
1295        if (unlikely(ret)) {
1296                /*
1297                 * The value we are returning here does not get propagated to
1298                 * the verbs caller. Thus we need to complete the request with
1299                 * error otherwise the caller could be sitting waiting on the
1300                 * completion event. Only do this for PIO. SDMA has its own
1301                 * mechanism for handling the errors. So for SDMA we can just
1302                 * return.
1303                 */
1304                if (sr == dd->process_pio_send) {
1305                        unsigned long flags;
1306
1307                        hfi1_cdbg(PIO, "%s() Failed. Completing with err",
1308                                  __func__);
1309                        spin_lock_irqsave(&qp->s_lock, flags);
1310                        rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
1311                        spin_unlock_irqrestore(&qp->s_lock, flags);
1312                }
1313                return -EINVAL;
1314        }
1315        if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
1316                return pio_wait(qp,
1317                                ps->s_txreq->psc,
1318                                ps,
1319                                HFI1_S_WAIT_PIO_DRAIN);
1320        return sr(qp, ps, 0);
1321}
1322
1323/**
1324 * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
1325 * @dd: the device data structure
1326 */
1327static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
1328{
1329        struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
1330        u32 ver = dd->dc8051_ver;
1331
1332        memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
1333
1334        rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) |
1335                ((u64)(dc8051_ver_min(ver)) << 16) |
1336                (u64)dc8051_ver_patch(ver);
1337
1338        rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1339                        IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1340                        IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1341                        IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
1342                        IB_DEVICE_MEM_MGT_EXTENSIONS |
1343                        IB_DEVICE_RDMA_NETDEV_OPA_VNIC;
1344        rdi->dparms.props.page_size_cap = PAGE_SIZE;
1345        rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
1346        rdi->dparms.props.vendor_part_id = dd->pcidev->device;
1347        rdi->dparms.props.hw_ver = dd->minrev;
1348        rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
1349        rdi->dparms.props.max_mr_size = U64_MAX;
1350        rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
1351        rdi->dparms.props.max_qp = hfi1_max_qps;
1352        rdi->dparms.props.max_qp_wr =
1353                (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
1354                 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
1355        rdi->dparms.props.max_send_sge = hfi1_max_sges;
1356        rdi->dparms.props.max_recv_sge = hfi1_max_sges;
1357        rdi->dparms.props.max_sge_rd = hfi1_max_sges;
1358        rdi->dparms.props.max_cq = hfi1_max_cqs;
1359        rdi->dparms.props.max_ah = hfi1_max_ahs;
1360        rdi->dparms.props.max_cqe = hfi1_max_cqes;
1361        rdi->dparms.props.max_map_per_fmr = 32767;
1362        rdi->dparms.props.max_pd = hfi1_max_pds;
1363        rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
1364        rdi->dparms.props.max_qp_init_rd_atom = 255;
1365        rdi->dparms.props.max_srq = hfi1_max_srqs;
1366        rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
1367        rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
1368        rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
1369        rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
1370        rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
1371        rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
1372        rdi->dparms.props.max_total_mcast_qp_attach =
1373                                        rdi->dparms.props.max_mcast_qp_attach *
1374                                        rdi->dparms.props.max_mcast_grp;
1375}
1376
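    /*
     * Map OPA per-lane link speed flags onto the nearest IB speed encodings:
     * 25 Gb/s reports as EDR and 12.5 Gb/s as FDR.
     */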
1377static inline u16 opa_speed_to_ib(u16 in)
1378{
1379        u16 out = 0;
1380
1381        if (in & OPA_LINK_SPEED_25G)
1382                out |= IB_SPEED_EDR;
1383        if (in & OPA_LINK_SPEED_12_5G)
1384                out |= IB_SPEED_FDR;
1385
1386        return out;
1387}
1388
1389/*
1390 * Convert a single OPA link width (no multiple flags) to an IB value.
1391 * A zero OPA link width means link down, which means the IB width value
1392 * is a don't care.
1393 */
1394static inline u16 opa_width_to_ib(u16 in)
1395{
1396        switch (in) {
1397        case OPA_LINK_WIDTH_1X:
1398        /* map 2x and 3x to 1x as they don't exist in IB */
1399        case OPA_LINK_WIDTH_2X:
1400        case OPA_LINK_WIDTH_3X:
1401                return IB_WIDTH_1X;
1402        default: /* link down or unknown, return our largest width */
1403        case OPA_LINK_WIDTH_4X:
1404                return IB_WIDTH_4X;
1405        }
1406}
1407
1408static int query_port(struct rvt_dev_info *rdi, u8 port_num,
1409                      struct ib_port_attr *props)
1410{
1411        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1412        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1413        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1414        u32 lid = ppd->lid;
1415
1416        /* props is zeroed by the caller; avoid zeroing it again here */
1417        props->lid = lid;        /* lid is 0 when no LID has been assigned */
1418        props->lmc = ppd->lmc;
1419        /* OPA logical states match IB logical states */
1420        props->state = driver_lstate(ppd);
1421        props->phys_state = driver_pstate(ppd);
1422        props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
1423        props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
1424        /* see rate_show() in ib core/sysfs.c */
1425        props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
1426        props->max_vl_num = ppd->vls_supported;
1427
1428        /* Once we are a "first class" citizen and have added the OPA MTUs to
1429         * the core, we can advertise the larger MTU enum to the ULPs; for now,
1430         * advertise only 4K.
1431         *
1432         * Those applications which are either OPA aware or pass the MTU enum
1433         * from the Path Records to us will get the new 8K MTU.  Those that
1434         * attempt to process the MTU enum may fail in various ways.
1435         */
1436        props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
1437                                      4096 : hfi1_max_mtu), IB_MTU_4096);
1438        props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
1439                mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
1440
1441        return 0;
1442}
1443
1444static int modify_device(struct ib_device *device,
1445                         int device_modify_mask,
1446                         struct ib_device_modify *device_modify)
1447{
1448        struct hfi1_devdata *dd = dd_from_ibdev(device);
1449        unsigned i;
1450        int ret;
1451
1452        if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1453                                   IB_DEVICE_MODIFY_NODE_DESC)) {
1454                ret = -EOPNOTSUPP;
1455                goto bail;
1456        }
1457
1458        if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
1459                memcpy(device->node_desc, device_modify->node_desc,
1460                       IB_DEVICE_NODE_DESC_MAX);
1461                for (i = 0; i < dd->num_pports; i++) {
1462                        struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1463
1464                        hfi1_node_desc_chg(ibp);
1465                }
1466        }
1467
1468        if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
1469                ib_hfi1_sys_image_guid =
1470                        cpu_to_be64(device_modify->sys_image_guid);
1471                for (i = 0; i < dd->num_pports; i++) {
1472                        struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1473
1474                        hfi1_sys_guid_chg(ibp);
1475                }
1476        }
1477
1478        ret = 0;
1479
1480bail:
1481        return ret;
1482}
1483
1484static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num)
1485{
1486        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1487        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1488        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1489        int ret;
1490
1491        set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
1492                             OPA_LINKDOWN_REASON_UNKNOWN);
1493        ret = set_link_state(ppd, HLS_DN_DOWNDEF);
1494        return ret;
1495}
1496
1497static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
1498                            int guid_index, __be64 *guid)
1499{
1500        struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
1501
1502        if (guid_index >= HFI1_GUIDS_PER_PORT)
1503                return -EINVAL;
1504
1505        *guid = get_sguid(ibp, guid_index);
1506        return 0;
1507}
1508
1509/*
1510 * Convert an AH's port and SL into an SC.
1511 */
1512u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah)
1513{
1514        struct hfi1_ibport *ibp = to_iport(ibdev, rdma_ah_get_port_num(ah));
1515
1516        return ibp->sl_to_sc[rdma_ah_get_sl(ah)];
1517}
1518
1519static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
1520{
1521        struct hfi1_ibport *ibp;
1522        struct hfi1_pportdata *ppd;
1523        struct hfi1_devdata *dd;
1524        u8 sc5;
1525        u8 sl;
1526
1527        if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) &&
1528            !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
1529                return -EINVAL;
1530
1531        /* test the mapping for validity */
1532        ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
1533        ppd = ppd_from_ibp(ibp);
1534        dd = dd_from_ppd(ppd);
1535
1536        sl = rdma_ah_get_sl(ah_attr);
1537        if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
1538                return -EINVAL;
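            /* sanitize sl to block speculative out-of-bounds table loads */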
1539        sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc));
1540
1541        sc5 = ibp->sl_to_sc[sl];
1542        if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
1543                return -EINVAL;
1544        return 0;
1545}
1546
1547static void hfi1_notify_new_ah(struct ib_device *ibdev,
1548                               struct rdma_ah_attr *ah_attr,
1549                               struct rvt_ah *ah)
1550{
1551        struct hfi1_ibport *ibp;
1552        struct hfi1_pportdata *ppd;
1553        struct hfi1_devdata *dd;
1554        u8 sc5;
1555        struct rdma_ah_attr *attr = &ah->attr;
1556
1557        /*
1558         * Do not trust reading anything from rvt_ah at this point as it is not
1559         * done being set up. We can, however, modify the fields we need to set.
1560         */
1561
1562        ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
1563        ppd = ppd_from_ibp(ibp);
1564        sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)];
1565        hfi1_update_ah_attr(ibdev, attr);
1566        hfi1_make_opa_lid(attr);
1567        dd = dd_from_ppd(ppd);
1568        ah->vl = sc_to_vlt(dd, sc5);
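            /* set log_pmtu only for data VLs below num_vls or the management VL (15) */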
1569        if (ah->vl < num_vls || ah->vl == 15)
1570                ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
1571}
1572
1573/**
1574 * hfi1_get_npkeys - return the size of the PKEY table for context 0
1575 * @dd: the hfi1 device data structure
1576 */
1577unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
1578{
1579        return ARRAY_SIZE(dd->pport[0].pkeys);
1580}
1581
1582static void init_ibport(struct hfi1_pportdata *ppd)
1583{
1584        struct hfi1_ibport *ibp = &ppd->ibport_data;
1585        size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
1586        int i;
1587
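            /* default both tables to identity SL<->SC mappings */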
1588        for (i = 0; i < sz; i++) {
1589                ibp->sl_to_sc[i] = i;
1590                ibp->sc_to_sl[i] = i;
1591        }
1592
1593        for (i = 0; i < RVT_MAX_TRAP_LISTS; i++)
1594                INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list);
1595        timer_setup(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, 0);
1596
1597        spin_lock_init(&ibp->rvp.lock);
1598        /* Set the prefix to the default value (see IBTA spec ch. 4.1.1) */
1599        ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
1600        ibp->rvp.sm_lid = 0;
1601        /*
1602         * Below should only set bits defined in OPA PortInfo.CapabilityMask
1603         * and PortInfo.CapabilityMask3
1604         */
1605        ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
1606                IB_PORT_CAP_MASK_NOTICE_SUP;
1607        ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported;
1608        ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1609        ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1610        ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1611        ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1612        ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1613
1614        RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
1615        RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
1616}
1617
1618static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str)
1619{
1620        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
1621        struct hfi1_ibdev *dev = dev_from_rdi(rdi);
1622        u32 ver = dd_from_dev(dev)->dc8051_ver;
1623
1624        snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver),
1625                 dc8051_ver_min(ver), dc8051_ver_patch(ver));
1626}
1627
1628static const char * const driver_cntr_names[] = {
1629        /* must be element 0 */
1630        "DRIVER_KernIntr",
1631        "DRIVER_ErrorIntr",
1632        "DRIVER_Tx_Errs",
1633        "DRIVER_Rcv_Errs",
1634        "DRIVER_HW_Errs",
1635        "DRIVER_NoPIOBufs",
1636        "DRIVER_CtxtsOpen",
1637        "DRIVER_RcvLen_Errs",
1638        "DRIVER_EgrBufFull",
1639        "DRIVER_EgrHdrFull"
1640};
1641
1642static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names buffers */
1643static const char **dev_cntr_names;
1644static const char **port_cntr_names;
1645int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names);
1646static int num_dev_cntrs;
1647static int num_port_cntrs;
1648static int cntr_names_initialized;
1649
1650/*
1651 * Convert a list of names separated by '\n' into an array of NUL-terminated
1652 * strings. Optionally some entries can be reserved in the array to hold
1653 * extra external strings.
1654 */
1655static int init_cntr_names(const char *names_in,
1656                           const size_t names_len,
1657                           int num_extra_names,
1658                           int *num_cntrs,
1659                           const char ***cntr_names)
1660{
1661        char *names_out, *p, **q;
1662        int i, n;
1663
1664        n = 0;
1665        for (i = 0; i < names_len; i++)
1666                if (names_in[i] == '\n')
1667                        n++;
1668
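            /*
             * A single allocation holds the (n + num_extra_names) pointer
             * array followed by the writable copy of the names.
             */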
1669        names_out = kmalloc((n + num_extra_names) * sizeof(char *) + names_len,
1670                            GFP_KERNEL);
1671        if (!names_out) {
1672                *num_cntrs = 0;
1673                *cntr_names = NULL;
1674                return -ENOMEM;
1675        }
1676
1677        p = names_out + (n + num_extra_names) * sizeof(char *);
1678        memcpy(p, names_in, names_len);
1679
1680        q = (char **)names_out;
1681        for (i = 0; i < n; i++) {
1682                q[i] = p;
1683                p = strchr(p, '\n');
1684                *p++ = '\0';
1685        }
1686
1687        *num_cntrs = n;
1688        *cntr_names = (const char **)names_out;
1689        return 0;
1690}
1691
1692static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev,
1693                                            u8 port_num)
1694{
1695        int i, err;
1696
1697        mutex_lock(&cntr_names_lock);
1698        if (!cntr_names_initialized) {
1699                struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1700
1701                err = init_cntr_names(dd->cntrnames,
1702                                      dd->cntrnameslen,
1703                                      num_driver_cntrs,
1704                                      &num_dev_cntrs,
1705                                      &dev_cntr_names);
1706                if (err) {
1707                        mutex_unlock(&cntr_names_lock);
1708                        return NULL;
1709                }
1710
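                    /* append the driver counter names after the device counters */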
1711                for (i = 0; i < num_driver_cntrs; i++)
1712                        dev_cntr_names[num_dev_cntrs + i] =
1713                                driver_cntr_names[i];
1714
1715                err = init_cntr_names(dd->portcntrnames,
1716                                      dd->portcntrnameslen,
1717                                      0,
1718                                      &num_port_cntrs,
1719                                      &port_cntr_names);
1720                if (err) {
1721                        kfree(dev_cntr_names);
1722                        dev_cntr_names = NULL;
1723                        mutex_unlock(&cntr_names_lock);
1724                        return NULL;
1725                }
1726                cntr_names_initialized = 1;
1727        }
1728        mutex_unlock(&cntr_names_lock);
1729
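            /* port_num == 0 requests device-level stats; nonzero is per-port */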
1730        if (!port_num)
1731                return rdma_alloc_hw_stats_struct(
1732                                dev_cntr_names,
1733                                num_dev_cntrs + num_driver_cntrs,
1734                                RDMA_HW_STATS_DEFAULT_LIFESPAN);
1735        else
1736                return rdma_alloc_hw_stats_struct(
1737                                port_cntr_names,
1738                                num_port_cntrs,
1739                                RDMA_HW_STATS_DEFAULT_LIFESPAN);
1740}
1741
1742static u64 hfi1_sps_ints(void)
1743{
1744        unsigned long index, flags;
1745        struct hfi1_devdata *dd;
1746        u64 sps_ints = 0;
1747
1748        xa_lock_irqsave(&hfi1_dev_table, flags);
1749        xa_for_each(&hfi1_dev_table, index, dd) {
1750                sps_ints += get_all_cpu_total(dd->int_counter);
1751        }
1752        xa_unlock_irqrestore(&hfi1_dev_table, flags);
1753        return sps_ints;
1754}
1755
1756static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
1757                        u8 port, int index)
1758{
1759        u64 *values;
1760        int count;
1761
1762        if (!port) {
1763                u64 *driver_stats = (u64 *)&hfi1_stats;
1764                int i;
1765
1766                hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values);
1767                values[num_dev_cntrs] = hfi1_sps_ints();
1768                for (i = 1; i < num_driver_cntrs; i++)
1769                        values[num_dev_cntrs + i] = driver_stats[i];
1770                count = num_dev_cntrs + num_driver_cntrs;
1771        } else {
1772                struct hfi1_ibport *ibp = to_iport(ibdev, port);
1773
1774                hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values);
1775                count = num_port_cntrs;
1776        }
1777
1778        memcpy(stats->value, values, count * sizeof(u64));
1779        return count;
1780}
1781
1782static const struct ib_device_ops hfi1_dev_ops = {
1783        .owner = THIS_MODULE,
1784        .driver_id = RDMA_DRIVER_HFI1,
1785
1786        .alloc_hw_stats = alloc_hw_stats,
1787        .alloc_rdma_netdev = hfi1_vnic_alloc_rn,
1788        .get_dev_fw_str = hfi1_get_dev_fw_str,
1789        .get_hw_stats = get_hw_stats,
1790        .init_port = hfi1_create_port_files,
1791        .modify_device = modify_device,
1792        /* keep process mad in the driver */
1793        .process_mad = hfi1_process_mad,
1794};
1795
1796/**
1797 * hfi1_register_ib_device - register our device with the infiniband core
1798 * @dd: the device data structure
1799 * Return: 0 if successful, errno if unsuccessful.
1800 */
1801int hfi1_register_ib_device(struct hfi1_devdata *dd)
1802{
1803        struct hfi1_ibdev *dev = &dd->verbs_dev;
1804        struct ib_device *ibdev = &dev->rdi.ibdev;
1805        struct hfi1_pportdata *ppd = dd->pport;
1806        struct hfi1_ibport *ibp = &ppd->ibport_data;
1807        unsigned i;
1808        int ret;
1809
1810        for (i = 0; i < dd->num_pports; i++)
1811                init_ibport(ppd + i);
1812
1813        /* Only need to initialize non-zero fields. */
1814
1815        timer_setup(&dev->mem_timer, mem_timer, 0);
1816
1817        seqlock_init(&dev->iowait_lock);
1818        seqlock_init(&dev->txwait_lock);
1819        INIT_LIST_HEAD(&dev->txwait);
1820        INIT_LIST_HEAD(&dev->memwait);
1821
1822        ret = verbs_txreq_init(dev);
1823        if (ret)
1824                goto err_verbs_txreq;
1825
1826        /* Use first-port GUID as node guid */
1827        ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX);
1828
1829        /*
1830         * The system image GUID is supposed to be the same for all
1831         * HFIs in a single system but since there can be other
1832         * device types in the system, we can't be sure this is unique.
1833         */
1834        if (!ib_hfi1_sys_image_guid)
1835                ib_hfi1_sys_image_guid = ibdev->node_guid;
1836        ibdev->phys_port_cnt = dd->num_pports;
1837        ibdev->dev.parent = &dd->pcidev->dev;
1838
1839        ib_set_device_ops(ibdev, &hfi1_dev_ops);
1840
1841        strlcpy(ibdev->node_desc, init_utsname()->nodename,
1842                sizeof(ibdev->node_desc));
1843
1844        /*
1845         * Fill in rvt info object.
1846         */
1847        dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
1848        dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
1849        dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
1850        dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
1851        dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
1852        dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
1853        dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
1854        /*
1855         * Fill in rvt info device attributes.
1856         */
1857        hfi1_fill_device_attr(dd);
1858
1859        /* queue pair */
1860        dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
1861        dd->verbs_dev.rdi.dparms.qpn_start = 0;
1862        dd->verbs_dev.rdi.dparms.qpn_inc = 1;
1863        dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
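            /*
             * Reserve a 64K QPN window starting at kdeth_qp << 16; QPNs in
             * this range are withheld from rdmavt's QPN allocator.
             */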
1864        dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
1865        dd->verbs_dev.rdi.dparms.qpn_res_end =
1866                dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
1867        dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
1868        dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
1869        dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
1870        dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
1871        dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA |
1872                                                RDMA_CORE_CAP_OPA_AH;
1873        dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
1874
1875        dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
1876        dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
1877        dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
1878        dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
1879        dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
1880        dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt;
1881        dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
1882        dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
1883        dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
1884        dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
1885        dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
1886        dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
1887        dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
1889        dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
1890        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
1891        dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
1892        dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
1893        dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
1894        dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
1895        dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
1896                                                hfi1_comp_vect_mappings_lookup;
1897
1898        /* completion queue */
1899        dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus;
1900        dd->verbs_dev.rdi.dparms.node = dd->node;
1901
1902        /* misc settings */
1903        dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
1904        dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
1905        dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
1906        dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
1907        dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
1908        dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
1909        dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
1910        dd->verbs_dev.rdi.dparms.reserved_operations = 1;
1911        dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
1912
1913        /* post send table */
1914        dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
1915
1916        /* opcode translation table */
1917        dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
1918
1919        ppd = dd->pport;
1920        for (i = 0; i < dd->num_pports; i++, ppd++)
1921                rvt_init_port(&dd->verbs_dev.rdi,
1922                              &ppd->ibport_data.rvp,
1923                              i,
1924                              ppd->pkeys);
1925
1926        rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
1927                                    &ib_hfi1_attr_group);
1928
1929        ret = rvt_register_device(&dd->verbs_dev.rdi);
1930        if (ret)
1931                goto err_verbs_txreq;
1932
1933        ret = hfi1_verbs_register_sysfs(dd);
1934        if (ret)
1935                goto err_class;
1936
1937        return ret;
1938
1939err_class:
1940        rvt_unregister_device(&dd->verbs_dev.rdi);
1941err_verbs_txreq:
1942        verbs_txreq_exit(dev);
1943        dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1944        return ret;
1945}
1946
1947void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
1948{
1949        struct hfi1_ibdev *dev = &dd->verbs_dev;
1950
1951        hfi1_verbs_unregister_sysfs(dd);
1952
1953        rvt_unregister_device(&dd->verbs_dev.rdi);
1954
1955        if (!list_empty(&dev->txwait))
1956                dd_dev_err(dd, "txwait list not empty!\n");
1957        if (!list_empty(&dev->memwait))
1958                dd_dev_err(dd, "memwait list not empty!\n");
1959
1960        del_timer_sync(&dev->mem_timer);
1961        verbs_txreq_exit(dev);
1962
1963        mutex_lock(&cntr_names_lock);
1964        kfree(dev_cntr_names);
1965        kfree(port_cntr_names);
1966        dev_cntr_names = NULL;
1967        port_cntr_names = NULL;
1968        cntr_names_initialized = 0;
1969        mutex_unlock(&cntr_names_lock);
1970}
1971
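    /**
     * hfi1_cnp_rcv - handle a received CNP (Congestion Notification Packet)
     * @packet: the receive packet context
     *
     * Derive the remote LID/QPN and service type from the QP type, then hand
     * the congestion notification to process_becn().
     */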
1972void hfi1_cnp_rcv(struct hfi1_packet *packet)
1973{
1974        struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1975        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1976        struct ib_header *hdr = packet->hdr;
1977        struct rvt_qp *qp = packet->qp;
1978        u32 lqpn, rqpn = 0;
1979        u16 rlid = 0;
1980        u8 sl, sc5, svc_type;
1981
1982        switch (packet->qp->ibqp.qp_type) {
1983        case IB_QPT_UC:
1984                rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1985                rqpn = qp->remote_qpn;
1986                svc_type = IB_CC_SVCTYPE_UC;
1987                break;
1988        case IB_QPT_RC:
1989                rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1990                rqpn = qp->remote_qpn;
1991                svc_type = IB_CC_SVCTYPE_RC;
1992                break;
1993        case IB_QPT_SMI:
1994        case IB_QPT_GSI:
1995        case IB_QPT_UD:
1996                svc_type = IB_CC_SVCTYPE_UD;
1997                break;
1998        default:
1999                ibp->rvp.n_pkt_drops++;
2000                return;
2001        }
2002
2003        sc5 = hfi1_9B_get_sc5(hdr, packet->rhf);
2004        sl = ibp->sc_to_sl[sc5];
2005        lqpn = qp->ibqp.qp_num;
2006
2007        process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
2008}
2009