linux/drivers/infiniband/hw/hfi1/verbs.c
   1/*
   2 * Copyright(c) 2015 - 2020 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <rdma/ib_mad.h>
  49#include <rdma/ib_user_verbs.h>
  50#include <linux/io.h>
  51#include <linux/module.h>
  52#include <linux/utsname.h>
  53#include <linux/rculist.h>
  54#include <linux/mm.h>
  55#include <linux/vmalloc.h>
  56#include <rdma/opa_addr.h>
  57#include <linux/nospec.h>
  58
  59#include "hfi.h"
  60#include "common.h"
  61#include "device.h"
  62#include "trace.h"
  63#include "qp.h"
  64#include "verbs_txreq.h"
  65#include "debugfs.h"
  66#include "vnic.h"
  67#include "fault.h"
  68#include "affinity.h"
  69#include "ipoib.h"
  70
  71static unsigned int hfi1_lkey_table_size = 16;
  72module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
  73                   S_IRUGO);
  74MODULE_PARM_DESC(lkey_table_size,
  75                 "LKEY table size in bits (2^n, 1 <= n <= 23)");
  76
  77static unsigned int hfi1_max_pds = 0xFFFF;
  78module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO);
  79MODULE_PARM_DESC(max_pds,
  80                 "Maximum number of protection domains to support");
  81
  82static unsigned int hfi1_max_ahs = 0xFFFF;
  83module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO);
  84MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
  85
  86unsigned int hfi1_max_cqes = 0x2FFFFF;
  87module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO);
  88MODULE_PARM_DESC(max_cqes,
  89                 "Maximum number of completion queue entries to support");
  90
  91unsigned int hfi1_max_cqs = 0x1FFFF;
  92module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO);
  93MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
  94
  95unsigned int hfi1_max_qp_wrs = 0x3FFF;
  96module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO);
  97MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
  98
  99unsigned int hfi1_max_qps = 32768;
 100module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO);
 101MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
 102
 103unsigned int hfi1_max_sges = 0x60;
 104module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO);
 105MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
 106
 107unsigned int hfi1_max_mcast_grps = 16384;
 108module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO);
 109MODULE_PARM_DESC(max_mcast_grps,
 110                 "Maximum number of multicast groups to support");
 111
 112unsigned int hfi1_max_mcast_qp_attached = 16;
 113module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached,
 114                   uint, S_IRUGO);
 115MODULE_PARM_DESC(max_mcast_qp_attached,
 116                 "Maximum number of attached QPs to support");
 117
 118unsigned int hfi1_max_srqs = 1024;
 119module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO);
 120MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
 121
 122unsigned int hfi1_max_srq_sges = 128;
 123module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO);
 124MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
 125
 126unsigned int hfi1_max_srq_wrs = 0x1FFFF;
 127module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO);
  128MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
 129
 130unsigned short piothreshold = 256;
 131module_param(piothreshold, ushort, S_IRUGO);
 132MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
 133
 134static unsigned int sge_copy_mode;
 135module_param(sge_copy_mode, uint, S_IRUGO);
 136MODULE_PARM_DESC(sge_copy_mode,
 137                 "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS");
 138
 139static void verbs_sdma_complete(
 140        struct sdma_txreq *cookie,
 141        int status);
 142
 143static int pio_wait(struct rvt_qp *qp,
 144                    struct send_context *sc,
 145                    struct hfi1_pkt_state *ps,
 146                    u32 flag);
 147
 148/* Length of buffer to create verbs txreq cache name */
 149#define TXREQ_NAME_LEN 24
 150
 151static uint wss_threshold = 80;
 152module_param(wss_threshold, uint, S_IRUGO);
 153MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
 154static uint wss_clean_period = 256;
 155module_param(wss_clean_period, uint, S_IRUGO);
 156MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
 157
 158/*
 159 * Translate ib_wr_opcode into ib_wc_opcode.
 160 */
 161const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
 162        [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 163        [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
 164        [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
 165        [IB_WR_SEND] = IB_WC_SEND,
 166        [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
 167        [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
 168        [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
 169        [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
 170        [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
 171        [IB_WR_SEND_WITH_INV] = IB_WC_SEND,
 172        [IB_WR_LOCAL_INV] = IB_WC_LOCAL_INV,
 173        [IB_WR_REG_MR] = IB_WC_REG_MR
 174};
 175
 176/*
 177 * Length of header by opcode, 0 --> not supported
 178 */
 179const u8 hdr_len_by_opcode[256] = {
 180        /* RC */
 181        [IB_OPCODE_RC_SEND_FIRST]                     = 12 + 8,
 182        [IB_OPCODE_RC_SEND_MIDDLE]                    = 12 + 8,
 183        [IB_OPCODE_RC_SEND_LAST]                      = 12 + 8,
 184        [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
 185        [IB_OPCODE_RC_SEND_ONLY]                      = 12 + 8,
 186        [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
 187        [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
 188        [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = 12 + 8,
 189        [IB_OPCODE_RC_RDMA_WRITE_LAST]                = 12 + 8,
 190        [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
 191        [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
 192        [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
 193        [IB_OPCODE_RC_RDMA_READ_REQUEST]              = 12 + 8 + 16,
 194        [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = 12 + 8 + 4,
 195        [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = 12 + 8,
 196        [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = 12 + 8 + 4,
 197        [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = 12 + 8 + 4,
 198        [IB_OPCODE_RC_ACKNOWLEDGE]                    = 12 + 8 + 4,
 199        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = 12 + 8 + 4 + 8,
 200        [IB_OPCODE_RC_COMPARE_SWAP]                   = 12 + 8 + 28,
 201        [IB_OPCODE_RC_FETCH_ADD]                      = 12 + 8 + 28,
 202        [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = 12 + 8 + 4,
 203        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = 12 + 8 + 4,
 204        [IB_OPCODE_TID_RDMA_READ_REQ]                 = 12 + 8 + 36,
 205        [IB_OPCODE_TID_RDMA_READ_RESP]                = 12 + 8 + 36,
 206        [IB_OPCODE_TID_RDMA_WRITE_REQ]                = 12 + 8 + 36,
 207        [IB_OPCODE_TID_RDMA_WRITE_RESP]               = 12 + 8 + 36,
 208        [IB_OPCODE_TID_RDMA_WRITE_DATA]               = 12 + 8 + 36,
 209        [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST]          = 12 + 8 + 36,
 210        [IB_OPCODE_TID_RDMA_ACK]                      = 12 + 8 + 36,
 211        [IB_OPCODE_TID_RDMA_RESYNC]                   = 12 + 8 + 36,
 212        /* UC */
 213        [IB_OPCODE_UC_SEND_FIRST]                     = 12 + 8,
 214        [IB_OPCODE_UC_SEND_MIDDLE]                    = 12 + 8,
 215        [IB_OPCODE_UC_SEND_LAST]                      = 12 + 8,
 216        [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = 12 + 8 + 4,
 217        [IB_OPCODE_UC_SEND_ONLY]                      = 12 + 8,
 218        [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 4,
 219        [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = 12 + 8 + 16,
 220        [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = 12 + 8,
 221        [IB_OPCODE_UC_RDMA_WRITE_LAST]                = 12 + 8,
 222        [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4,
 223        [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = 12 + 8 + 16,
 224        [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20,
 225        /* UD */
 226        [IB_OPCODE_UD_SEND_ONLY]                      = 12 + 8 + 8,
 227        [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = 12 + 8 + 12
 228};
 229
 230static const opcode_handler opcode_handler_tbl[256] = {
 231        /* RC */
 232        [IB_OPCODE_RC_SEND_FIRST]                     = &hfi1_rc_rcv,
 233        [IB_OPCODE_RC_SEND_MIDDLE]                    = &hfi1_rc_rcv,
 234        [IB_OPCODE_RC_SEND_LAST]                      = &hfi1_rc_rcv,
 235        [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
 236        [IB_OPCODE_RC_SEND_ONLY]                      = &hfi1_rc_rcv,
 237        [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_rc_rcv,
 238        [IB_OPCODE_RC_RDMA_WRITE_FIRST]               = &hfi1_rc_rcv,
 239        [IB_OPCODE_RC_RDMA_WRITE_MIDDLE]              = &hfi1_rc_rcv,
 240        [IB_OPCODE_RC_RDMA_WRITE_LAST]                = &hfi1_rc_rcv,
 241        [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv,
 242        [IB_OPCODE_RC_RDMA_WRITE_ONLY]                = &hfi1_rc_rcv,
 243        [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv,
 244        [IB_OPCODE_RC_RDMA_READ_REQUEST]              = &hfi1_rc_rcv,
 245        [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST]       = &hfi1_rc_rcv,
 246        [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE]      = &hfi1_rc_rcv,
 247        [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST]        = &hfi1_rc_rcv,
 248        [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY]        = &hfi1_rc_rcv,
 249        [IB_OPCODE_RC_ACKNOWLEDGE]                    = &hfi1_rc_rcv,
 250        [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE]             = &hfi1_rc_rcv,
 251        [IB_OPCODE_RC_COMPARE_SWAP]                   = &hfi1_rc_rcv,
 252        [IB_OPCODE_RC_FETCH_ADD]                      = &hfi1_rc_rcv,
 253        [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 254        [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE]      = &hfi1_rc_rcv,
 255
  256        /* TID RDMA has separate handlers for different opcodes. */
 257        [IB_OPCODE_TID_RDMA_WRITE_REQ]       = &hfi1_rc_rcv_tid_rdma_write_req,
 258        [IB_OPCODE_TID_RDMA_WRITE_RESP]      = &hfi1_rc_rcv_tid_rdma_write_resp,
 259        [IB_OPCODE_TID_RDMA_WRITE_DATA]      = &hfi1_rc_rcv_tid_rdma_write_data,
 260        [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
 261        [IB_OPCODE_TID_RDMA_READ_REQ]        = &hfi1_rc_rcv_tid_rdma_read_req,
 262        [IB_OPCODE_TID_RDMA_READ_RESP]       = &hfi1_rc_rcv_tid_rdma_read_resp,
 263        [IB_OPCODE_TID_RDMA_RESYNC]          = &hfi1_rc_rcv_tid_rdma_resync,
 264        [IB_OPCODE_TID_RDMA_ACK]             = &hfi1_rc_rcv_tid_rdma_ack,
 265
 266        /* UC */
 267        [IB_OPCODE_UC_SEND_FIRST]                     = &hfi1_uc_rcv,
 268        [IB_OPCODE_UC_SEND_MIDDLE]                    = &hfi1_uc_rcv,
 269        [IB_OPCODE_UC_SEND_LAST]                      = &hfi1_uc_rcv,
 270        [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
 271        [IB_OPCODE_UC_SEND_ONLY]                      = &hfi1_uc_rcv,
 272        [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_uc_rcv,
 273        [IB_OPCODE_UC_RDMA_WRITE_FIRST]               = &hfi1_uc_rcv,
 274        [IB_OPCODE_UC_RDMA_WRITE_MIDDLE]              = &hfi1_uc_rcv,
 275        [IB_OPCODE_UC_RDMA_WRITE_LAST]                = &hfi1_uc_rcv,
 276        [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv,
 277        [IB_OPCODE_UC_RDMA_WRITE_ONLY]                = &hfi1_uc_rcv,
 278        [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv,
 279        /* UD */
 280        [IB_OPCODE_UD_SEND_ONLY]                      = &hfi1_ud_rcv,
 281        [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE]       = &hfi1_ud_rcv,
 282        /* CNP */
 283        [IB_OPCODE_CNP]                               = &hfi1_cnp_rcv
 284};
 285
 286#define OPMASK 0x1f
 287
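     /*
      * Opcodes that may use a PIO send, indexed by the opcode's upper three
      * bits; each entry is a bitmask over the opcode's low five bits (OPMASK).
      */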
 288static const u32 pio_opmask[BIT(3)] = {
 289        /* RC */
 290        [IB_OPCODE_RC >> 5] =
 291                BIT(RC_OP(SEND_ONLY) & OPMASK) |
 292                BIT(RC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
 293                BIT(RC_OP(RDMA_WRITE_ONLY) & OPMASK) |
 294                BIT(RC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK) |
 295                BIT(RC_OP(RDMA_READ_REQUEST) & OPMASK) |
 296                BIT(RC_OP(ACKNOWLEDGE) & OPMASK) |
 297                BIT(RC_OP(ATOMIC_ACKNOWLEDGE) & OPMASK) |
 298                BIT(RC_OP(COMPARE_SWAP) & OPMASK) |
 299                BIT(RC_OP(FETCH_ADD) & OPMASK),
 300        /* UC */
 301        [IB_OPCODE_UC >> 5] =
 302                BIT(UC_OP(SEND_ONLY) & OPMASK) |
 303                BIT(UC_OP(SEND_ONLY_WITH_IMMEDIATE) & OPMASK) |
 304                BIT(UC_OP(RDMA_WRITE_ONLY) & OPMASK) |
 305                BIT(UC_OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & OPMASK),
 306};
 307
 308/*
 309 * System image GUID.
 310 */
 311__be64 ib_hfi1_sys_image_guid;
 312
 313/*
 314 * Make sure the QP is ready and able to accept the given opcode.
 315 */
 316static inline opcode_handler qp_ok(struct hfi1_packet *packet)
 317{
 318        if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 319                return NULL;
 320        if (((packet->opcode & RVT_OPCODE_QP_MASK) ==
 321             packet->qp->allowed_ops) ||
 322            (packet->opcode == IB_OPCODE_CNP))
 323                return opcode_handler_tbl[packet->opcode];
 324
 325        return NULL;
 326}
 327
 328static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
 329{
 330#ifdef CONFIG_FAULT_INJECTION
 331        if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
 332                /*
 333                 * In order to drop non-IB traffic we
 334                 * set PbcInsertHrc to NONE (0x2).
 335                 * The packet will still be delivered
 336                 * to the receiving node but a
 337                 * KHdrHCRCErr (KDETH packet with a bad
 338                 * HCRC) will be triggered and the
 339                 * packet will not be delivered to the
 340                 * correct context.
 341                 */
 342                pbc &= ~PBC_INSERT_HCRC_SMASK;
 343                pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
 344        } else {
 345                /*
 346                 * In order to drop regular verbs
 347                 * traffic we set the PbcTestEbp
 348                 * flag. The packet will still be
 349                 * delivered to the receiving node but
 350                 * a 'late ebp error' will be
 351                 * triggered and will be dropped.
 352                 */
 353                pbc |= PBC_TEST_EBP;
 354        }
 355#endif
 356        return pbc;
 357}
 358
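     /*
      * Make sure the QP is an RC QP that is ready to receive and that the
      * opcode is a TID RDMA opcode, then return the matching receive handler.
      */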
 359static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
 360{
 361        if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
 362            !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 363                return NULL;
 364        if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
 365                return opcode_handler_tbl[opcode];
 366        return NULL;
 367}
 368
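     /*
      * hfi1_kdeth_eager_rcv - handle a KDETH packet received in an eager buffer
      *
      * Validate the header, look up the verbs QP carried in the TID RDMA
      * header, and dispatch to the TID RDMA receive handler for the opcode.
      */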
 369void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
 370{
 371        struct hfi1_ctxtdata *rcd = packet->rcd;
 372        struct ib_header *hdr = packet->hdr;
 373        u32 tlen = packet->tlen;
 374        struct hfi1_pportdata *ppd = rcd->ppd;
 375        struct hfi1_ibport *ibp = &ppd->ibport_data;
 376        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 377        opcode_handler opcode_handler;
 378        unsigned long flags;
 379        u32 qp_num;
 380        int lnh;
 381        u8 opcode;
 382
 383        /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
 384        if (unlikely(tlen < 15 * sizeof(u32)))
 385                goto drop;
 386
 387        lnh = be16_to_cpu(hdr->lrh[0]) & 3;
 388        if (lnh != HFI1_LRH_BTH)
 389                goto drop;
 390
 391        packet->ohdr = &hdr->u.oth;
 392        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
 393
 394        opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
 395        inc_opstats(tlen, &rcd->opstats->stats[opcode]);
 396
 397        /* verbs_qp can be picked up from any tid_rdma header struct */
 398        qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
 399                RVT_QPN_MASK;
 400
 401        rcu_read_lock();
 402        packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 403        if (!packet->qp)
 404                goto drop_rcu;
 405        spin_lock_irqsave(&packet->qp->r_lock, flags);
 406        opcode_handler = tid_qp_ok(opcode, packet);
 407        if (likely(opcode_handler))
 408                opcode_handler(packet);
 409        else
 410                goto drop_unlock;
 411        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 412        rcu_read_unlock();
 413
 414        return;
 415drop_unlock:
 416        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 417drop_rcu:
 418        rcu_read_unlock();
 419drop:
 420        ibp->rvp.n_pkt_drops++;
 421}
 422
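     /*
      * hfi1_kdeth_expected_rcv - handle a KDETH packet received in an expected
      * (TID) buffer; same flow as hfi1_kdeth_eager_rcv, except the verbs QP
      * number is taken from the TID RDMA response header.
      */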
 423void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
 424{
 425        struct hfi1_ctxtdata *rcd = packet->rcd;
 426        struct ib_header *hdr = packet->hdr;
 427        u32 tlen = packet->tlen;
 428        struct hfi1_pportdata *ppd = rcd->ppd;
 429        struct hfi1_ibport *ibp = &ppd->ibport_data;
 430        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 431        opcode_handler opcode_handler;
 432        unsigned long flags;
 433        u32 qp_num;
 434        int lnh;
 435        u8 opcode;
 436
 437        /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
 438        if (unlikely(tlen < 15 * sizeof(u32)))
 439                goto drop;
 440
 441        lnh = be16_to_cpu(hdr->lrh[0]) & 3;
 442        if (lnh != HFI1_LRH_BTH)
 443                goto drop;
 444
 445        packet->ohdr = &hdr->u.oth;
 446        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
 447
 448        opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
 449        inc_opstats(tlen, &rcd->opstats->stats[opcode]);
 450
 451        /* verbs_qp can be picked up from any tid_rdma header struct */
 452        qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
 453                RVT_QPN_MASK;
 454
 455        rcu_read_lock();
 456        packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 457        if (!packet->qp)
 458                goto drop_rcu;
 459        spin_lock_irqsave(&packet->qp->r_lock, flags);
 460        opcode_handler = tid_qp_ok(opcode, packet);
 461        if (likely(opcode_handler))
 462                opcode_handler(packet);
 463        else
 464                goto drop_unlock;
 465        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 466        rcu_read_unlock();
 467
 468        return;
 469drop_unlock:
 470        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 471drop_rcu:
 472        rcu_read_unlock();
 473drop:
 474        ibp->rvp.n_pkt_drops++;
 475}
 476
 477static int hfi1_do_pkey_check(struct hfi1_packet *packet)
 478{
 479        struct hfi1_ctxtdata *rcd = packet->rcd;
 480        struct hfi1_pportdata *ppd = rcd->ppd;
 481        struct hfi1_16b_header *hdr = packet->hdr;
 482        u16 pkey;
 483
 484        /* Pkey check needed only for bypass packets */
 485        if (packet->etype != RHF_RCV_TYPE_BYPASS)
 486                return 0;
 487
 488        /* Perform pkey check */
 489        pkey = hfi1_16B_get_pkey(hdr);
 490        return ingress_pkey_check(ppd, pkey, packet->sc,
 491                                  packet->qp->s_pkey_index,
 492                                  packet->slid, true);
 493}
 494
 495static inline void hfi1_handle_packet(struct hfi1_packet *packet,
 496                                      bool is_mcast)
 497{
 498        u32 qp_num;
 499        struct hfi1_ctxtdata *rcd = packet->rcd;
 500        struct hfi1_pportdata *ppd = rcd->ppd;
 501        struct hfi1_ibport *ibp = rcd_to_iport(rcd);
 502        struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
 503        opcode_handler packet_handler;
 504        unsigned long flags;
 505
 506        inc_opstats(packet->tlen, &rcd->opstats->stats[packet->opcode]);
 507
 508        if (unlikely(is_mcast)) {
 509                struct rvt_mcast *mcast;
 510                struct rvt_mcast_qp *p;
 511
 512                if (!packet->grh)
 513                        goto drop;
 514                mcast = rvt_mcast_find(&ibp->rvp,
 515                                       &packet->grh->dgid,
 516                                       opa_get_lid(packet->dlid, 9B));
 517                if (!mcast)
 518                        goto drop;
 519                rcu_read_lock();
 520                list_for_each_entry_rcu(p, &mcast->qp_list, list) {
 521                        packet->qp = p->qp;
 522                        if (hfi1_do_pkey_check(packet))
 523                                goto unlock_drop;
 524                        spin_lock_irqsave(&packet->qp->r_lock, flags);
 525                        packet_handler = qp_ok(packet);
 526                        if (likely(packet_handler))
 527                                packet_handler(packet);
 528                        else
 529                                ibp->rvp.n_pkt_drops++;
 530                        spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 531                }
 532                rcu_read_unlock();
 533                /*
 534                 * Notify rvt_multicast_detach() if it is waiting for us
 535                 * to finish.
 536                 */
 537                if (atomic_dec_return(&mcast->refcount) <= 1)
 538                        wake_up(&mcast->wait);
 539        } else {
 540                /* Get the destination QP number. */
 541                if (packet->etype == RHF_RCV_TYPE_BYPASS &&
 542                    hfi1_16B_get_l4(packet->hdr) == OPA_16B_L4_FM)
 543                        qp_num = hfi1_16B_get_dest_qpn(packet->mgmt);
 544                else
 545                        qp_num = ib_bth_get_qpn(packet->ohdr);
 546
 547                rcu_read_lock();
 548                packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
 549                if (!packet->qp)
 550                        goto unlock_drop;
 551
 552                if (hfi1_do_pkey_check(packet))
 553                        goto unlock_drop;
 554
 555                spin_lock_irqsave(&packet->qp->r_lock, flags);
 556                packet_handler = qp_ok(packet);
 557                if (likely(packet_handler))
 558                        packet_handler(packet);
 559                else
 560                        ibp->rvp.n_pkt_drops++;
 561                spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 562                rcu_read_unlock();
 563        }
 564        return;
 565unlock_drop:
 566        rcu_read_unlock();
 567drop:
 568        ibp->rvp.n_pkt_drops++;
 569}
 570
 571/**
 572 * hfi1_ib_rcv - process an incoming packet
 573 * @packet: data packet information
 574 *
 575 * This is called to process an incoming packet at interrupt level.
 576 */
 577void hfi1_ib_rcv(struct hfi1_packet *packet)
 578{
 579        struct hfi1_ctxtdata *rcd = packet->rcd;
 580
 581        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
 582        hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
 583}
 584
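     /**
      * hfi1_16B_rcv - process an incoming 16B packet
      * @packet: data packet information
      *
      * This is called to process an incoming 16B packet at interrupt level.
      */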
 585void hfi1_16B_rcv(struct hfi1_packet *packet)
 586{
 587        struct hfi1_ctxtdata *rcd = packet->rcd;
 588
 589        trace_input_ibhdr(rcd->dd, packet, false);
 590        hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid));
 591}
 592
 593/*
 594 * This is called from a timer to check for QPs
 595 * which need kernel memory in order to send a packet.
 596 */
 597static void mem_timer(struct timer_list *t)
 598{
 599        struct hfi1_ibdev *dev = from_timer(dev, t, mem_timer);
 600        struct list_head *list = &dev->memwait;
 601        struct rvt_qp *qp = NULL;
 602        struct iowait *wait;
 603        unsigned long flags;
 604        struct hfi1_qp_priv *priv;
 605
 606        write_seqlock_irqsave(&dev->iowait_lock, flags);
 607        if (!list_empty(list)) {
 608                wait = list_first_entry(list, struct iowait, list);
 609                qp = iowait_to_qp(wait);
 610                priv = qp->priv;
 611                list_del_init(&priv->s_iowait.list);
 612                priv->s_iowait.lock = NULL;
 613                /* refcount held until actual wake up */
 614                if (!list_empty(list))
 615                        mod_timer(&dev->mem_timer, jiffies + 1);
 616        }
 617        write_sequnlock_irqrestore(&dev->iowait_lock, flags);
 618
 619        if (qp)
 620                hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM);
 621}
 622
 623/*
 624 * This is called with progress side lock held.
 625 */
 626/* New API */
 627static void verbs_sdma_complete(
 628        struct sdma_txreq *cookie,
 629        int status)
 630{
 631        struct verbs_txreq *tx =
 632                container_of(cookie, struct verbs_txreq, txreq);
 633        struct rvt_qp *qp = tx->qp;
 634
 635        spin_lock(&qp->s_lock);
 636        if (tx->wqe) {
 637                rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
 638        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
 639                struct hfi1_opa_header *hdr;
 640
 641                hdr = &tx->phdr.hdr;
 642                if (unlikely(status == SDMA_TXREQ_S_ABORTED))
 643                        hfi1_rc_verbs_aborted(qp, hdr);
 644                hfi1_rc_send_complete(qp, hdr);
 645        }
 646        spin_unlock(&qp->s_lock);
 647
 648        hfi1_put_txreq(tx);
 649}
 650
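     /*
      * Put the QP on the device memwait list and arm the memory timer so the
      * QP is woken up (RVT_S_WAIT_KMEM) once kernel memory may be available.
      */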
 651void hfi1_wait_kmem(struct rvt_qp *qp)
 652{
 653        struct hfi1_qp_priv *priv = qp->priv;
 654        struct ib_qp *ibqp = &qp->ibqp;
 655        struct ib_device *ibdev = ibqp->device;
 656        struct hfi1_ibdev *dev = to_idev(ibdev);
 657
 658        if (list_empty(&priv->s_iowait.list)) {
 659                if (list_empty(&dev->memwait))
 660                        mod_timer(&dev->mem_timer, jiffies + 1);
 661                qp->s_flags |= RVT_S_WAIT_KMEM;
 662                list_add_tail(&priv->s_iowait.list, &dev->memwait);
 663                priv->s_iowait.lock = &dev->iowait_lock;
 664                trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
 665                rvt_get_qp(qp);
 666        }
 667}
 668
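     /*
      * Queue the txreq and the QP to wait for kernel memory. Returns -EBUSY
      * if the QP was queued, or 0 if the QP is no longer in a state that
      * allows sending (the caller then frees the txreq).
      */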
 669static int wait_kmem(struct hfi1_ibdev *dev,
 670                     struct rvt_qp *qp,
 671                     struct hfi1_pkt_state *ps)
 672{
 673        unsigned long flags;
 674        int ret = 0;
 675
 676        spin_lock_irqsave(&qp->s_lock, flags);
 677        if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 678                write_seqlock(&dev->iowait_lock);
 679                list_add_tail(&ps->s_txreq->txreq.list,
 680                              &ps->wait->tx_head);
 681                hfi1_wait_kmem(qp);
 682                write_sequnlock(&dev->iowait_lock);
 683                hfi1_qp_unbusy(qp, ps->wait);
 684                ret = -EBUSY;
 685        }
 686        spin_unlock_irqrestore(&qp->s_lock, flags);
 687
 688        return ret;
 689}
 690
 691/*
 692 * This routine calls txadds for each sg entry.
 693 *
 694 * Add failures will revert the sge cursor
 695 */
 696static noinline int build_verbs_ulp_payload(
 697        struct sdma_engine *sde,
 698        u32 length,
 699        struct verbs_txreq *tx)
 700{
 701        struct rvt_sge_state *ss = tx->ss;
 702        struct rvt_sge *sg_list = ss->sg_list;
 703        struct rvt_sge sge = ss->sge;
 704        u8 num_sge = ss->num_sge;
 705        u32 len;
 706        int ret = 0;
 707
 708        while (length) {
 709                len = rvt_get_sge_length(&ss->sge, length);
 710                WARN_ON_ONCE(len == 0);
 711                ret = sdma_txadd_kvaddr(
 712                        sde->dd,
 713                        &tx->txreq,
 714                        ss->sge.vaddr,
 715                        len);
 716                if (ret)
 717                        goto bail_txadd;
 718                rvt_update_sge(ss, len, false);
 719                length -= len;
 720        }
 721        return ret;
 722bail_txadd:
 723        /* unwind cursor */
 724        ss->sge = sge;
 725        ss->num_sge = num_sge;
 726        ss->sg_list = sg_list;
 727        return ret;
 728}
 729
 730/**
 731 * update_tx_opstats - record stats by opcode
 732 * @qp: the qp
 733 * @ps: transmit packet state
 734 * @plen: the plen in dwords
 735 *
 736 * This is a routine to record the tx opstats after a
 737 * packet has been presented to the egress mechanism.
 738 */
 739static void update_tx_opstats(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 740                              u32 plen)
 741{
 742#ifdef CONFIG_DEBUG_FS
 743        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
 744        struct hfi1_opcode_stats_perctx *s = get_cpu_ptr(dd->tx_opstats);
 745
 746        inc_opstats(plen * 4, &s->stats[ps->opcode]);
 747        put_cpu_ptr(s);
 748#endif
 749}
 750
 751/*
 752 * Build the number of DMA descriptors needed to send length bytes of data.
 753 *
 754 * NOTE: DMA mapping is held in the tx until completed in the ring or
 755 *       the tx desc is freed without having been submitted to the ring
 756 *
 757 * This routine ensures all the helper routine calls succeed.
 758 */
 759/* New API */
 760static int build_verbs_tx_desc(
 761        struct sdma_engine *sde,
 762        u32 length,
 763        struct verbs_txreq *tx,
 764        struct hfi1_ahg_info *ahg_info,
 765        u64 pbc)
 766{
 767        int ret = 0;
 768        struct hfi1_sdma_header *phdr = &tx->phdr;
 769        u16 hdrbytes = (tx->hdr_dwords + sizeof(pbc) / 4) << 2;
 770        u8 extra_bytes = 0;
 771
 772        if (tx->phdr.hdr.hdr_type) {
 773                /*
 774                 * hdrbytes accounts for PBC. Need to subtract 8 bytes
 775                 * before calculating padding.
 776                 */
 777                extra_bytes = hfi1_get_16b_padding(hdrbytes - 8, length) +
 778                              (SIZE_OF_CRC << 2) + SIZE_OF_LT;
 779        }
 780        if (!ahg_info->ahgcount) {
 781                ret = sdma_txinit_ahg(
 782                        &tx->txreq,
 783                        ahg_info->tx_flags,
 784                        hdrbytes + length +
 785                        extra_bytes,
 786                        ahg_info->ahgidx,
 787                        0,
 788                        NULL,
 789                        0,
 790                        verbs_sdma_complete);
 791                if (ret)
 792                        goto bail_txadd;
 793                phdr->pbc = cpu_to_le64(pbc);
 794                ret = sdma_txadd_kvaddr(
 795                        sde->dd,
 796                        &tx->txreq,
 797                        phdr,
 798                        hdrbytes);
 799                if (ret)
 800                        goto bail_txadd;
 801        } else {
 802                ret = sdma_txinit_ahg(
 803                        &tx->txreq,
 804                        ahg_info->tx_flags,
 805                        length,
 806                        ahg_info->ahgidx,
 807                        ahg_info->ahgcount,
 808                        ahg_info->ahgdesc,
 809                        hdrbytes,
 810                        verbs_sdma_complete);
 811                if (ret)
 812                        goto bail_txadd;
 813        }
 814        /* add the ulp payload - if any. tx->ss can be NULL for acks */
 815        if (tx->ss) {
 816                ret = build_verbs_ulp_payload(sde, length, tx);
 817                if (ret)
 818                        goto bail_txadd;
 819        }
 820
 821        /* add icrc, lt byte, and padding to flit */
 822        if (extra_bytes)
 823                ret = sdma_txadd_daddr(sde->dd, &tx->txreq,
 824                                       sde->dd->sdma_pad_phys, extra_bytes);
 825
 826bail_txadd:
 827        return ret;
 828}
 829
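     /*
      * For TID RDMA opcodes, request KDETH HCRC insertion in the PBC
      * (PbcInsertHcrc = LKDETH); all other opcodes leave the PBC unchanged.
      */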
 830static u64 update_hcrc(u8 opcode, u64 pbc)
 831{
 832        if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
 833                pbc &= ~PBC_INSERT_HCRC_SMASK;
 834                pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
 835        }
 836        return pbc;
 837}
 838
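     /**
      * hfi1_verbs_send_dma - send a packet using SDMA
      * @qp: the QP to send on
      * @ps: the state of the packet to send
      * @pbc: the PBC to prepend, or 0 to have one built here
      *
      * Build the SDMA descriptors for the txreq (if not already built) and
      * submit it to the QP's SDMA engine.
      * Return zero if the packet was sent or queued, otherwise a negative errno.
      */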
 839int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 840                        u64 pbc)
 841{
 842        struct hfi1_qp_priv *priv = qp->priv;
 843        struct hfi1_ahg_info *ahg_info = priv->s_ahg;
 844        u32 hdrwords = ps->s_txreq->hdr_dwords;
 845        u32 len = ps->s_txreq->s_cur_size;
 846        u32 plen;
 847        struct hfi1_ibdev *dev = ps->dev;
 848        struct hfi1_pportdata *ppd = ps->ppd;
 849        struct verbs_txreq *tx;
 850        u8 sc5 = priv->s_sc;
 851        int ret;
 852        u32 dwords;
 853
 854        if (ps->s_txreq->phdr.hdr.hdr_type) {
 855                u8 extra_bytes = hfi1_get_16b_padding((hdrwords << 2), len);
 856
 857                dwords = (len + extra_bytes + (SIZE_OF_CRC << 2) +
 858                          SIZE_OF_LT) >> 2;
 859        } else {
 860                dwords = (len + 3) >> 2;
 861        }
 862        plen = hdrwords + dwords + sizeof(pbc) / 4;
 863
 864        tx = ps->s_txreq;
 865        if (!sdma_txreq_built(&tx->txreq)) {
 866                if (likely(pbc == 0)) {
 867                        u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
 868
 869                        /* No vl15 here */
 870                        /* set PBC_DC_INFO bit (aka SC[4]) in pbc */
 871                        if (ps->s_txreq->phdr.hdr.hdr_type)
 872                                pbc |= PBC_PACKET_BYPASS |
 873                                       PBC_INSERT_BYPASS_ICRC;
 874                        else
 875                                pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
 876
 877                        pbc = create_pbc(ppd,
 878                                         pbc,
 879                                         qp->srate_mbps,
 880                                         vl,
 881                                         plen);
 882
 883                        if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
 884                                pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
 885                        else
 886                                /* Update HCRC based on packet opcode */
 887                                pbc = update_hcrc(ps->opcode, pbc);
 888                }
 889                tx->wqe = qp->s_wqe;
 890                ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
 891                if (unlikely(ret))
 892                        goto bail_build;
 893        }
  894        ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
 895        if (unlikely(ret < 0)) {
 896                if (ret == -ECOMM)
 897                        goto bail_ecomm;
 898                return ret;
 899        }
 900
 901        update_tx_opstats(qp, ps, plen);
 902        trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
 903                                &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 904        return ret;
 905
 906bail_ecomm:
 907        /* The current one got "sent" */
 908        return 0;
 909bail_build:
 910        ret = wait_kmem(dev, qp, ps);
 911        if (!ret) {
 912                /* free txreq - bad state */
 913                hfi1_put_txreq(ps->s_txreq);
 914                ps->s_txreq = NULL;
 915        }
 916        return ret;
 917}
 918
 919/*
 920 * If we are now in the error state, return zero to flush the
 921 * send work request.
 922 */
 923static int pio_wait(struct rvt_qp *qp,
 924                    struct send_context *sc,
 925                    struct hfi1_pkt_state *ps,
 926                    u32 flag)
 927{
 928        struct hfi1_qp_priv *priv = qp->priv;
 929        struct hfi1_devdata *dd = sc->dd;
 930        unsigned long flags;
 931        int ret = 0;
 932
 933        /*
 934         * Note that as soon as want_buffer() is called and
 935         * possibly before it returns, sc_piobufavail()
 936         * could be called. Therefore, put QP on the I/O wait list before
 937         * enabling the PIO avail interrupt.
 938         */
 939        spin_lock_irqsave(&qp->s_lock, flags);
 940        if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 941                write_seqlock(&sc->waitlock);
 942                list_add_tail(&ps->s_txreq->txreq.list,
 943                              &ps->wait->tx_head);
 944                if (list_empty(&priv->s_iowait.list)) {
 945                        struct hfi1_ibdev *dev = &dd->verbs_dev;
 946                        int was_empty;
 947
 948                        dev->n_piowait += !!(flag & RVT_S_WAIT_PIO);
 949                        dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
 950                        qp->s_flags |= flag;
 951                        was_empty = list_empty(&sc->piowait);
 952                        iowait_get_priority(&priv->s_iowait);
 953                        iowait_queue(ps->pkts_sent, &priv->s_iowait,
 954                                     &sc->piowait);
 955                        priv->s_iowait.lock = &sc->waitlock;
 956                        trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
 957                        rvt_get_qp(qp);
 958                        /* counting: only call wantpiobuf_intr if first user */
 959                        if (was_empty)
 960                                hfi1_sc_wantpiobuf_intr(sc, 1);
 961                }
 962                write_sequnlock(&sc->waitlock);
 963                hfi1_qp_unbusy(qp, ps->wait);
 964                ret = -EBUSY;
 965        }
 966        spin_unlock_irqrestore(&qp->s_lock, flags);
 967        return ret;
 968}
 969
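     /*
      * PIO send completion callback: decrement the outstanding PIO count and
      * wake up anyone draining the QP's I/O wait.
      */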
 970static void verbs_pio_complete(void *arg, int code)
 971{
 972        struct rvt_qp *qp = (struct rvt_qp *)arg;
 973        struct hfi1_qp_priv *priv = qp->priv;
 974
 975        if (iowait_pio_dec(&priv->s_iowait))
 976                iowait_drain_wakeup(&priv->s_iowait);
 977}
 978
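     /**
      * hfi1_verbs_send_pio - send a packet using a PIO buffer
      * @qp: the QP to send on
      * @ps: the state of the packet to send
      * @pbc: the PBC to prepend, or 0 to have one built here
      *
      * Copy the packet into a PIO send buffer. If no buffer is available,
      * either queue the QP to wait for one or, if the context is not active,
      * complete the WQE with an error.
      */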
 979int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 980                        u64 pbc)
 981{
 982        struct hfi1_qp_priv *priv = qp->priv;
 983        u32 hdrwords = ps->s_txreq->hdr_dwords;
 984        struct rvt_sge_state *ss = ps->s_txreq->ss;
 985        u32 len = ps->s_txreq->s_cur_size;
 986        u32 dwords;
 987        u32 plen;
 988        struct hfi1_pportdata *ppd = ps->ppd;
 989        u32 *hdr;
 990        u8 sc5;
 991        unsigned long flags = 0;
 992        struct send_context *sc;
 993        struct pio_buf *pbuf;
 994        int wc_status = IB_WC_SUCCESS;
 995        int ret = 0;
 996        pio_release_cb cb = NULL;
 997        u8 extra_bytes = 0;
 998
 999        if (ps->s_txreq->phdr.hdr.hdr_type) {
1000                u8 pad_size = hfi1_get_16b_padding((hdrwords << 2), len);
1001
1002                extra_bytes = pad_size + (SIZE_OF_CRC << 2) + SIZE_OF_LT;
1003                dwords = (len + extra_bytes) >> 2;
1004                hdr = (u32 *)&ps->s_txreq->phdr.hdr.opah;
1005        } else {
1006                dwords = (len + 3) >> 2;
1007                hdr = (u32 *)&ps->s_txreq->phdr.hdr.ibh;
1008        }
1009        plen = hdrwords + dwords + sizeof(pbc) / 4;
1010
1011        /* only RC/UC use complete */
1012        switch (qp->ibqp.qp_type) {
1013        case IB_QPT_RC:
1014        case IB_QPT_UC:
1015                cb = verbs_pio_complete;
1016                break;
1017        default:
1018                break;
1019        }
1020
1021        /* vl15 special case taken care of in ud.c */
1022        sc5 = priv->s_sc;
1023        sc = ps->s_txreq->psc;
1024
1025        if (likely(pbc == 0)) {
1026                u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5);
1027
1028                /* set PBC_DC_INFO bit (aka SC[4]) in pbc */
1029                if (ps->s_txreq->phdr.hdr.hdr_type)
1030                        pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
1031                else
1032                        pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
1033
1034                pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
1035                if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
1036                        pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
1037                else
1038                        /* Update HCRC based on packet opcode */
1039                        pbc = update_hcrc(ps->opcode, pbc);
1040        }
1041        if (cb)
1042                iowait_pio_inc(&priv->s_iowait);
1043        pbuf = sc_buffer_alloc(sc, plen, cb, qp);
1044        if (IS_ERR_OR_NULL(pbuf)) {
1045                if (cb)
1046                        verbs_pio_complete(qp, 0);
1047                if (IS_ERR(pbuf)) {
1048                        /*
1049                         * If we have filled the PIO buffers to capacity and are
1050                         * not in an active state, this request is not going
1051                         * to go out, so just complete it with an error or else a
1052                         * ULP or the core may be stuck waiting.
1053                         */
1054                        hfi1_cdbg(
1055                                PIO,
1056                                "alloc failed. state not active, completing");
1057                        wc_status = IB_WC_GENERAL_ERR;
1058                        goto pio_bail;
1059                } else {
1060                        /*
1061                         * This is a normal occurrence. The PIO buffers are
1062                         * full, but we may still be actively sending, so
1063                         * continue to queue the request.
1064                         */
1065                        hfi1_cdbg(PIO, "alloc failed. state active, queuing");
1066                        ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO);
1067                        if (!ret)
1068                                /* txreq not queued - free */
1069                                goto bail;
1070                        /* tx consumed in wait */
1071                        return ret;
1072                }
1073        }
1074
1075        if (dwords == 0) {
1076                pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords);
1077        } else {
1078                seg_pio_copy_start(pbuf, pbc,
1079                                   hdr, hdrwords * 4);
1080                if (ss) {
1081                        while (len) {
1082                                void *addr = ss->sge.vaddr;
1083                                u32 slen = rvt_get_sge_length(&ss->sge, len);
1084
1085                                rvt_update_sge(ss, slen, false);
1086                                seg_pio_copy_mid(pbuf, addr, slen);
1087                                len -= slen;
1088                        }
1089                }
1090                /* add icrc, lt byte, and padding to flit */
1091                if (extra_bytes)
1092                        seg_pio_copy_mid(pbuf, ppd->dd->sdma_pad_dma,
1093                                         extra_bytes);
1094
1095                seg_pio_copy_end(pbuf);
1096        }
1097
1098        update_tx_opstats(qp, ps, plen);
1099        trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
1100                               &ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
1101
1102pio_bail:
1103        spin_lock_irqsave(&qp->s_lock, flags);
1104        if (qp->s_wqe) {
1105                rvt_send_complete(qp, qp->s_wqe, wc_status);
1106        } else if (qp->ibqp.qp_type == IB_QPT_RC) {
1107                if (unlikely(wc_status == IB_WC_GENERAL_ERR))
1108                        hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
1109                hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
1110        }
1111        spin_unlock_irqrestore(&qp->s_lock, flags);
1112
1113        ret = 0;
1114
1115bail:
1116        hfi1_put_txreq(ps->s_txreq);
1117        return ret;
1118}
1119
1120/*
1121 * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
1122 * being an entry from the partition key table), return 0
1123 * otherwise. Use the matching criteria for egress partition keys
1124 * specified in the OPAv1 spec., section 9.1l.7.
1125 */
1126static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
1127{
1128        u16 mkey = pkey & PKEY_LOW_15_MASK;
1129        u16 mentry = ent & PKEY_LOW_15_MASK;
1130
1131        if (mkey == mentry) {
1132                /*
1133                 * If pkey[15] is set (full partition member),
1134                 * is bit 15 in the corresponding table element
1135                 * clear (limited member)?
1136                 */
1137                if (pkey & PKEY_MEMBER_MASK)
1138                        return !!(ent & PKEY_MEMBER_MASK);
1139                return 1;
1140        }
1141        return 0;
1142}
1143
1144/**
1145 * egress_pkey_check - check P_KEY of a packet
1146 * @ppd:  Physical IB port data
1147 * @slid: SLID for packet
1148 * @pkey: PKEY for header
1149 * @sc5:  SC for packet
1150 * @s_pkey_index: Used as a lookup optimization for kernel contexts
1151 * only. A negative value means a user context is calling this
1152 * function.
1153 *
1154 * It checks if hdr's pkey is valid.
1155 *
1156 * Return: 0 on success, otherwise, 1
1157 */
1158int egress_pkey_check(struct hfi1_pportdata *ppd, u32 slid, u16 pkey,
1159                      u8 sc5, int8_t s_pkey_index)
1160{
1161        struct hfi1_devdata *dd;
1162        int i;
1163        int is_user_ctxt_mechanism = (s_pkey_index < 0);
1164
1165        if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
1166                return 0;
1167
1168        /* If SC15, pkey[0:14] must be 0x7fff */
1169        if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
1170                goto bad;
1171
1172        /* Is the pkey = 0x0, or 0x8000? */
1173        if ((pkey & PKEY_LOW_15_MASK) == 0)
1174                goto bad;
1175
1176        /*
1177         * For the kernel contexts only, if a qp is passed into the function,
1178         * the most likely matching pkey has index qp->s_pkey_index
1179         */
1180        if (!is_user_ctxt_mechanism &&
1181            egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
1182                return 0;
1183        }
1184
1185        for (i = 0; i < MAX_PKEY_VALUES; i++) {
1186                if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
1187                        return 0;
1188        }
1189bad:
1190        /*
1191         * For the user-context mechanism, the P_KEY check would only happen
1192         * once per SDMA request, not once per packet.  Therefore, there's no
1193         * need to increment the counter for the user-context mechanism.
1194         */
1195        if (!is_user_ctxt_mechanism) {
1196                incr_cntr64(&ppd->port_xmit_constraint_errors);
1197                dd = ppd->dd;
1198                if (!(dd->err_info_xmit_constraint.status &
1199                      OPA_EI_STATUS_SMASK)) {
1200                        dd->err_info_xmit_constraint.status |=
1201                                OPA_EI_STATUS_SMASK;
1202                        dd->err_info_xmit_constraint.slid = slid;
1203                        dd->err_info_xmit_constraint.pkey = pkey;
1204                }
1205        }
1206        return 1;
1207}
1208
1209/*
1210 * get_send_routine - choose an egress routine
1211 *
1212 * Choose an egress routine based on QP type
1213 * and size
1214 */
1215static inline send_routine get_send_routine(struct rvt_qp *qp,
1216                                            struct hfi1_pkt_state *ps)
1217{
1218        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1219        struct hfi1_qp_priv *priv = qp->priv;
1220        struct verbs_txreq *tx = ps->s_txreq;
1221
1222        if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA)))
1223                return dd->process_pio_send;
1224        switch (qp->ibqp.qp_type) {
1225        case IB_QPT_SMI:
1226                return dd->process_pio_send;
1227        case IB_QPT_GSI:
1228        case IB_QPT_UD:
1229                break;
1230        case IB_QPT_UC:
1231        case IB_QPT_RC:
1232                priv->s_running_pkt_size =
1233                        (tx->s_cur_size + priv->s_running_pkt_size) / 2;
1234                if (piothreshold &&
1235                    priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
1236                    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
1237                    iowait_sdma_pending(&priv->s_iowait) == 0 &&
1238                    !sdma_txreq_built(&tx->txreq))
1239                        return dd->process_pio_send;
1240                break;
1241        default:
1242                break;
1243        }
1244        return dd->process_dma_send;
1245}
1246
1247/**
1248 * hfi1_verbs_send - send a packet
1249 * @qp: the QP to send on
1250 * @ps: the state of the packet to send
1251 *
1252 * Return zero if packet is sent or queued OK.
1253 * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise.
1254 */
1255int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
1256{
1257        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
1258        struct hfi1_qp_priv *priv = qp->priv;
1259        struct ib_other_headers *ohdr = NULL;
1260        send_routine sr;
1261        int ret;
1262        u16 pkey;
1263        u32 slid;
1264        u8 l4 = 0;
1265
1266        /* locate the pkey within the headers */
1267        if (ps->s_txreq->phdr.hdr.hdr_type) {
1268                struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah;
1269
1270                l4 = hfi1_16B_get_l4(hdr);
1271                if (l4 == OPA_16B_L4_IB_LOCAL)
1272                        ohdr = &hdr->u.oth;
1273                else if (l4 == OPA_16B_L4_IB_GLOBAL)
1274                        ohdr = &hdr->u.l.oth;
1275
1276                slid = hfi1_16B_get_slid(hdr);
1277                pkey = hfi1_16B_get_pkey(hdr);
1278        } else {
1279                struct ib_header *hdr = &ps->s_txreq->phdr.hdr.ibh;
1280                u8 lnh = ib_get_lnh(hdr);
1281
1282                if (lnh == HFI1_LRH_GRH)
1283                        ohdr = &hdr->u.l.oth;
1284                else
1285                        ohdr = &hdr->u.oth;
1286                slid = ib_get_slid(hdr);
1287                pkey = ib_bth_get_pkey(ohdr);
1288        }
1289
1290        if (likely(l4 != OPA_16B_L4_FM))
1291                ps->opcode = ib_bth_get_opcode(ohdr);
1292        else
1293                ps->opcode = IB_OPCODE_UD_SEND_ONLY;
1294
1295        sr = get_send_routine(qp, ps);
1296        ret = egress_pkey_check(dd->pport, slid, pkey,
1297                                priv->s_sc, qp->s_pkey_index);
1298        if (unlikely(ret)) {
1299                /*
1300                 * The value we are returning here does not get propagated to
1301                 * the verbs caller. Thus we need to complete the request with
1302                 * an error, otherwise the caller could be sitting waiting on the
1303                 * completion event. Only do this for PIO. SDMA has its own
1304                 * mechanism for handling the errors. So for SDMA we can just
1305                 * return.
1306                 */
1307                if (sr == dd->process_pio_send) {
1308                        unsigned long flags;
1309
1310                        hfi1_cdbg(PIO, "%s() Failed. Completing with err",
1311                                  __func__);
1312                        spin_lock_irqsave(&qp->s_lock, flags);
1313                        rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
1314                        spin_unlock_irqrestore(&qp->s_lock, flags);
1315                }
1316                return -EINVAL;
1317        }
1318        if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait))
1319                return pio_wait(qp,
1320                                ps->s_txreq->psc,
1321                                ps,
1322                                HFI1_S_WAIT_PIO_DRAIN);
1323        return sr(qp, ps, 0);
1324}
1325
1326/**
1327 * hfi1_fill_device_attr - Fill in rvt dev info device attributes.
1328 * @dd: the device data structure
1329 */
1330static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
1331{
1332        struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
1333        u32 ver = dd->dc8051_ver;
1334
1335        memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props));
1336
1337        rdi->dparms.props.fw_ver = ((u64)(dc8051_ver_maj(ver)) << 32) |
1338                ((u64)(dc8051_ver_min(ver)) << 16) |
1339                (u64)dc8051_ver_patch(ver);
1340
1341        rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
1342                        IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
1343                        IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
1344                        IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
1345                        IB_DEVICE_MEM_MGT_EXTENSIONS |
1346                        IB_DEVICE_RDMA_NETDEV_OPA;
1347        rdi->dparms.props.page_size_cap = PAGE_SIZE;
1348        rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
1349        rdi->dparms.props.vendor_part_id = dd->pcidev->device;
1350        rdi->dparms.props.hw_ver = dd->minrev;
1351        rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid;
1352        rdi->dparms.props.max_mr_size = U64_MAX;
1353        rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
1354        rdi->dparms.props.max_qp = hfi1_max_qps;
1355        rdi->dparms.props.max_qp_wr =
1356                (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
1357                 HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
1358        rdi->dparms.props.max_send_sge = hfi1_max_sges;
1359        rdi->dparms.props.max_recv_sge = hfi1_max_sges;
1360        rdi->dparms.props.max_sge_rd = hfi1_max_sges;
1361        rdi->dparms.props.max_cq = hfi1_max_cqs;
1362        rdi->dparms.props.max_ah = hfi1_max_ahs;
1363        rdi->dparms.props.max_cqe = hfi1_max_cqes;
1364        rdi->dparms.props.max_pd = hfi1_max_pds;
1365        rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
1366        rdi->dparms.props.max_qp_init_rd_atom = 255;
1367        rdi->dparms.props.max_srq = hfi1_max_srqs;
1368        rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs;
1369        rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges;
1370        rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB;
1371        rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd);
1372        rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps;
1373        rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached;
1374        rdi->dparms.props.max_total_mcast_qp_attach =
1375                                        rdi->dparms.props.max_mcast_qp_attach *
1376                                        rdi->dparms.props.max_mcast_grp;
1377}
1378
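    /*
     * Convert OPA link speed flags to the closest IB speed encoding used in
     * ib_port_attr: 25G maps to EDR, 12.5G maps to FDR.
     */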
1379static inline u16 opa_speed_to_ib(u16 in)
1380{
1381        u16 out = 0;
1382
1383        if (in & OPA_LINK_SPEED_25G)
1384                out |= IB_SPEED_EDR;
1385        if (in & OPA_LINK_SPEED_12_5G)
1386                out |= IB_SPEED_FDR;
1387
1388        return out;
1389}
1390
1391/*
1392 * Convert a single OPA link width (no multiple flags) to an IB value.
1393 * A zero OPA link width means link down, which means the IB width value
1394 * is a don't care.
1395 */
1396static inline u16 opa_width_to_ib(u16 in)
1397{
1398        switch (in) {
1399        case OPA_LINK_WIDTH_1X:
1400        /* map 2x and 3x to 1x as they don't exist in IB */
1401        case OPA_LINK_WIDTH_2X:
1402        case OPA_LINK_WIDTH_3X:
1403                return IB_WIDTH_1X;
1404        default: /* link down or unknown, return our largest width */
1405        case OPA_LINK_WIDTH_4X:
1406                return IB_WIDTH_4X;
1407        }
1408}
1409
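    /*
     * rdmavt query_port_state handler: report the LID, LMC, logical and
     * physical link state, active width/speed, VL count, and MTUs for
     * port_num (1-based).
     */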
1410static int query_port(struct rvt_dev_info *rdi, u32 port_num,
1411                      struct ib_port_attr *props)
1412{
1413        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1414        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1415        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1416        u32 lid = ppd->lid;
1417
1418        /* props is zeroed by the caller, so avoid zeroing it here */
1419        props->lid = lid ? lid : 0;
1420        props->lmc = ppd->lmc;
1421        /* OPA logical states match IB logical states */
1422        props->state = driver_lstate(ppd);
1423        props->phys_state = driver_pstate(ppd);
1424        props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
1425        props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
1426        /* see rate_show() in ib core/sysfs.c */
1427        props->active_speed = opa_speed_to_ib(ppd->link_speed_active);
1428        props->max_vl_num = ppd->vls_supported;
1429
1430        /* Once we are a "first class" citizen and have added the OPA MTUs to
1431         * the core, we can advertise the larger MTU enum to the ULPs; for now,
1432         * advertise only 4K.
1433         *
1434         * Applications that are either OPA aware or pass the MTU enum from
1435         * the Path Records to us will get the new 8k MTU.  Those that
1436         * attempt to process the MTU enum themselves may fail in various ways.
1437         */
1438        props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
1439                                      4096 : hfi1_max_mtu), IB_MTU_4096);
1440        props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
1441                mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
1442        props->phys_mtu = HFI1_CAP_IS_KSET(AIP) ? hfi1_max_mtu :
1443                                ib_mtu_enum_to_int(props->max_mtu);
1444
1445        return 0;
1446}
1447
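    /*
     * Handle IB_DEVICE_MODIFY_NODE_DESC and IB_DEVICE_MODIFY_SYS_IMAGE_GUID,
     * notifying every port of the change; any other modify mask bit is
     * rejected with -EOPNOTSUPP.
     */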
1448static int modify_device(struct ib_device *device,
1449                         int device_modify_mask,
1450                         struct ib_device_modify *device_modify)
1451{
1452        struct hfi1_devdata *dd = dd_from_ibdev(device);
1453        unsigned i;
1454        int ret;
1455
1456        if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
1457                                   IB_DEVICE_MODIFY_NODE_DESC)) {
1458                ret = -EOPNOTSUPP;
1459                goto bail;
1460        }
1461
1462        if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
1463                memcpy(device->node_desc, device_modify->node_desc,
1464                       IB_DEVICE_NODE_DESC_MAX);
1465                for (i = 0; i < dd->num_pports; i++) {
1466                        struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1467
1468                        hfi1_node_desc_chg(ibp);
1469                }
1470        }
1471
1472        if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
1473                ib_hfi1_sys_image_guid =
1474                        cpu_to_be64(device_modify->sys_image_guid);
1475                for (i = 0; i < dd->num_pports; i++) {
1476                        struct hfi1_ibport *ibp = &dd->pport[i].ibport_data;
1477
1478                        hfi1_sys_guid_chg(ibp);
1479                }
1480        }
1481
1482        ret = 0;
1483
1484bail:
1485        return ret;
1486}
1487
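    /*
     * Record an "unknown" linkdown reason and request the default link-down
     * state for the given port.
     */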
1488static int shut_down_port(struct rvt_dev_info *rdi, u32 port_num)
1489{
1490        struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
1491        struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
1492        struct hfi1_pportdata *ppd = &dd->pport[port_num - 1];
1493        int ret;
1494
1495        set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0,
1496                             OPA_LINKDOWN_REASON_UNKNOWN);
1497        ret = set_link_state(ppd, HLS_DN_DOWNDEF);
1498        return ret;
1499}
1500
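    /*
     * Return the port GUID at guid_index in big-endian form, or -EINVAL if
     * the index is out of range.
     */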
1501static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp,
1502                            int guid_index, __be64 *guid)
1503{
1504        struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp);
1505
1506        if (guid_index >= HFI1_GUIDS_PER_PORT)
1507                return -EINVAL;
1508
1509        *guid = get_sguid(ibp, guid_index);
1510        return 0;
1511}
1512
1513/*
1514 * Convert an AH's port and SL into the corresponding SC.
1515 */
1516u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah)
1517{
1518        struct hfi1_ibport *ibp = to_iport(ibdev, rdma_ah_get_port_num(ah));
1519
1520        return ibp->sl_to_sc[rdma_ah_get_sl(ah)];
1521}
1522
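    /*
     * Validate an address handle: a multicast DLID requires a GRH, the SL
     * must index within the SL-to-SC table, and the resulting SC must map
     * to a valid VL (or VL15) on this device.
     */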
1523static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr)
1524{
1525        struct hfi1_ibport *ibp;
1526        struct hfi1_pportdata *ppd;
1527        struct hfi1_devdata *dd;
1528        u8 sc5;
1529        u8 sl;
1530
1531        if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) &&
1532            !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
1533                return -EINVAL;
1534
1535        /* test the mapping for validity */
1536        ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
1537        ppd = ppd_from_ibp(ibp);
1538        dd = dd_from_ppd(ppd);
1539
1540        sl = rdma_ah_get_sl(ah_attr);
1541        if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
1542                return -EINVAL;
1543        sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc));
1544
1545        sc5 = ibp->sl_to_sc[sl];
1546        if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
1547                return -EINVAL;
1548        return 0;
1549}
1550
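    /*
     * Finish driver-specific setup of a new address handle: resolve the SC
     * from the SL, update the attribute for OPA extended LIDs, and cache
     * the VL and log2 path MTU.
     */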
1551static void hfi1_notify_new_ah(struct ib_device *ibdev,
1552                               struct rdma_ah_attr *ah_attr,
1553                               struct rvt_ah *ah)
1554{
1555        struct hfi1_ibport *ibp;
1556        struct hfi1_pportdata *ppd;
1557        struct hfi1_devdata *dd;
1558        u8 sc5;
1559        struct rdma_ah_attr *attr = &ah->attr;
1560
1561        /*
1562         * Do not trust reading anything from rvt_ah at this point as it is not
1563         * done being set up. We can, however, modify the fields we need to set.
1564         */
1565
1566        ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
1567        ppd = ppd_from_ibp(ibp);
1568        sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)];
1569        hfi1_update_ah_attr(ibdev, attr);
1570        hfi1_make_opa_lid(attr);
1571        dd = dd_from_ppd(ppd);
1572        ah->vl = sc_to_vlt(dd, sc5);
1573        if (ah->vl < num_vls || ah->vl == 15)
1574                ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
1575}
1576
1577/**
1578 * hfi1_get_npkeys - return the size of the PKEY table for context 0
1579 * @dd: the device data structure
1580 */
1581unsigned hfi1_get_npkeys(struct hfi1_devdata *dd)
1582{
1583        return ARRAY_SIZE(dd->pport[0].pkeys);
1584}
1585
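    /*
     * Initialize per-port IB state: identity SL<->SC maps, trap lists and
     * trap timer, default GID prefix, capability masks, and the PMA counter
     * selects.
     */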
1586static void init_ibport(struct hfi1_pportdata *ppd)
1587{
1588        struct hfi1_ibport *ibp = &ppd->ibport_data;
1589        size_t sz = ARRAY_SIZE(ibp->sl_to_sc);
1590        int i;
1591
1592        for (i = 0; i < sz; i++) {
1593                ibp->sl_to_sc[i] = i;
1594                ibp->sc_to_sl[i] = i;
1595        }
1596
1597        for (i = 0; i < RVT_MAX_TRAP_LISTS; i++)
1598                INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list);
1599        timer_setup(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, 0);
1600
1601        spin_lock_init(&ibp->rvp.lock);
1602        /* Set the prefix to the default value (see ch. 4.1.1) */
1603        ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX;
1604        ibp->rvp.sm_lid = 0;
1605        /*
1606         * The assignments below should only set bits defined in OPA
1607         * PortInfo.CapabilityMask and PortInfo.CapabilityMask3.
1608         */
1609        ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP |
1610                IB_PORT_CAP_MASK_NOTICE_SUP;
1611        ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported;
1612        ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
1613        ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
1614        ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
1615        ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
1616        ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
1617
1618        RCU_INIT_POINTER(ibp->rvp.qp[0], NULL);
1619        RCU_INIT_POINTER(ibp->rvp.qp[1], NULL);
1620}
1621
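    /* Report the DC8051 firmware version as "major.minor.patch". */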
1622static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str)
1623{
1624        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
1625        struct hfi1_ibdev *dev = dev_from_rdi(rdi);
1626        u32 ver = dd_from_dev(dev)->dc8051_ver;
1627
1628        snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver),
1629                 dc8051_ver_min(ver), dc8051_ver_patch(ver));
1630}
1631
1632static const char * const driver_cntr_names[] = {
1633        /* must be element 0 */
1634        "DRIVER_KernIntr",
1635        "DRIVER_ErrorIntr",
1636        "DRIVER_Tx_Errs",
1637        "DRIVER_Rcv_Errs",
1638        "DRIVER_HW_Errs",
1639        "DRIVER_NoPIOBufs",
1640        "DRIVER_CtxtsOpen",
1641        "DRIVER_RcvLen_Errs",
1642        "DRIVER_EgrBufFull",
1643        "DRIVER_EgrHdrFull"
1644};
1645
1646static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names buffers */
1647static const char **dev_cntr_names;
1648static const char **port_cntr_names;
1649int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names);
1650static int num_dev_cntrs;
1651static int num_port_cntrs;
1652static int cntr_names_initialized;
1653
1654/*
1655 * Convert a list of names separated by '\n' into an array of NUL-terminated
1656 * strings. Optionally, some entries can be reserved in the array to hold extra
1657 * external strings.
1658 */
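    /*
     * The single allocation laid out by init_cntr_names() is the pointer
     * array followed by a copy of names_in with each '\n' rewritten to '\0':
     *   [ptr 0]...[ptr n-1][num_extra_names slots][name0\0name1\0...]
     */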
1659static int init_cntr_names(const char *names_in,
1660                           const size_t names_len,
1661                           int num_extra_names,
1662                           int *num_cntrs,
1663                           const char ***cntr_names)
1664{
1665        char *names_out, *p, **q;
1666        int i, n;
1667
1668        n = 0;
1669        for (i = 0; i < names_len; i++)
1670                if (names_in[i] == '\n')
1671                        n++;
1672
1673        names_out = kmalloc((n + num_extra_names) * sizeof(char *) + names_len,
1674                            GFP_KERNEL);
1675        if (!names_out) {
1676                *num_cntrs = 0;
1677                *cntr_names = NULL;
1678                return -ENOMEM;
1679        }
1680
1681        p = names_out + (n + num_extra_names) * sizeof(char *);
1682        memcpy(p, names_in, names_len);
1683
1684        q = (char **)names_out;
1685        for (i = 0; i < n; i++) {
1686                q[i] = p;
1687                p = strchr(p, '\n');
1688                *p++ = '\0';
1689        }
1690
1691        *num_cntrs = n;
1692        *cntr_names = (const char **)names_out;
1693        return 0;
1694}
1695
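    /*
     * Build the device and port counter name tables once, appending the
     * driver_cntr_names entries after the device counters; serialized by
     * cntr_names_lock.
     */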
1696static int init_counters(struct ib_device *ibdev)
1697{
1698        struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
1699        int i, err = 0;
1700
1701        mutex_lock(&cntr_names_lock);
1702        if (cntr_names_initialized)
1703                goto out_unlock;
1704
1705        err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, num_driver_cntrs,
1706                              &num_dev_cntrs, &dev_cntr_names);
1707        if (err)
1708                goto out_unlock;
1709
1710        for (i = 0; i < num_driver_cntrs; i++)
1711                dev_cntr_names[num_dev_cntrs + i] = driver_cntr_names[i];
1712
1713        err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, 0,
1714                              &num_port_cntrs, &port_cntr_names);
1715        if (err) {
1716                kfree(dev_cntr_names);
1717                dev_cntr_names = NULL;
1718                goto out_unlock;
1719        }
1720        cntr_names_initialized = 1;
1721
1722out_unlock:
1723        mutex_unlock(&cntr_names_lock);
1724        return err;
1725}
1726
1727static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev)
1728{
1729        if (init_counters(ibdev))
1730                return NULL;
1731        return rdma_alloc_hw_stats_struct(dev_cntr_names,
1732                                          num_dev_cntrs + num_driver_cntrs,
1733                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
1734}
1735
1736static struct rdma_hw_stats *hfi_alloc_hw_port_stats(struct ib_device *ibdev,
1737                                                     u32 port_num)
1738{
1739        if (init_counters(ibdev))
1740                return NULL;
1741        return rdma_alloc_hw_stats_struct(port_cntr_names, num_port_cntrs,
1742                                          RDMA_HW_STATS_DEFAULT_LIFESPAN);
1743}
1744
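    /* Sum the per-CPU interrupt counters across all hfi1 devices. */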
1745static u64 hfi1_sps_ints(void)
1746{
1747        unsigned long index, flags;
1748        struct hfi1_devdata *dd;
1749        u64 sps_ints = 0;
1750
1751        xa_lock_irqsave(&hfi1_dev_table, flags);
1752        xa_for_each(&hfi1_dev_table, index, dd) {
1753                sps_ints += get_all_cpu_total(dd->int_counter);
1754        }
1755        xa_unlock_irqrestore(&hfi1_dev_table, flags);
1756        return sps_ints;
1757}
1758
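    /*
     * Copy device-wide counters (port == 0) or per-port counters into
     * stats->value; the device set is the hardware counters followed by
     * the software driver counters, with the interrupt total patched in.
     */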
1759static int get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
1760                        u32 port, int index)
1761{
1762        u64 *values;
1763        int count;
1764
1765        if (!port) {
1766                u64 *stats = (u64 *)&hfi1_stats;
1767                int i;
1768
1769                hfi1_read_cntrs(dd_from_ibdev(ibdev), NULL, &values);
1770                values[num_dev_cntrs] = hfi1_sps_ints();
1771                for (i = 1; i < num_driver_cntrs; i++)
1772                        values[num_dev_cntrs + i] = stats[i];
1773                count = num_dev_cntrs + num_driver_cntrs;
1774        } else {
1775                struct hfi1_ibport *ibp = to_iport(ibdev, port);
1776
1777                hfi1_read_portcntrs(ppd_from_ibp(ibp), NULL, &values);
1778                count = num_port_cntrs;
1779        }
1780
1781        memcpy(stats->value, values, count * sizeof(u64));
1782        return count;
1783}
1784
1785static const struct ib_device_ops hfi1_dev_ops = {
1786        .owner = THIS_MODULE,
1787        .driver_id = RDMA_DRIVER_HFI1,
1788
1789        .alloc_hw_device_stats = hfi1_alloc_hw_device_stats,
1790        .alloc_hw_port_stats = hfi_alloc_hw_port_stats,
1791        .alloc_rdma_netdev = hfi1_vnic_alloc_rn,
1792        .device_group = &ib_hfi1_attr_group,
1793        .get_dev_fw_str = hfi1_get_dev_fw_str,
1794        .get_hw_stats = get_hw_stats,
1795        .modify_device = modify_device,
1796        .port_groups = hfi1_attr_port_groups,
1797        /* keep process_mad in the driver */
1798        .process_mad = hfi1_process_mad,
1799        .rdma_netdev_get_params = hfi1_ipoib_rn_get_params,
1800};
1801
1802/**
1803 * hfi1_register_ib_device - register our device with the infiniband core
1804 * @dd: the device data structure
1805 * Return 0 if successful, errno if unsuccessful.
1806 */
1807int hfi1_register_ib_device(struct hfi1_devdata *dd)
1808{
1809        struct hfi1_ibdev *dev = &dd->verbs_dev;
1810        struct ib_device *ibdev = &dev->rdi.ibdev;
1811        struct hfi1_pportdata *ppd = dd->pport;
1812        struct hfi1_ibport *ibp = &ppd->ibport_data;
1813        unsigned i;
1814        int ret;
1815
1816        for (i = 0; i < dd->num_pports; i++)
1817                init_ibport(ppd + i);
1818
1819        /* Only need to initialize non-zero fields. */
1820
1821        timer_setup(&dev->mem_timer, mem_timer, 0);
1822
1823        seqlock_init(&dev->iowait_lock);
1824        seqlock_init(&dev->txwait_lock);
1825        INIT_LIST_HEAD(&dev->txwait);
1826        INIT_LIST_HEAD(&dev->memwait);
1827
1828        ret = verbs_txreq_init(dev);
1829        if (ret)
1830                goto err_verbs_txreq;
1831
1832        /* Use first-port GUID as node guid */
1833        ibdev->node_guid = get_sguid(ibp, HFI1_PORT_GUID_INDEX);
1834
1835        /*
1836         * The system image GUID is supposed to be the same for all
1837         * HFIs in a single system but since there can be other
1838         * device types in the system, we can't be sure this is unique.
1839         */
1840        if (!ib_hfi1_sys_image_guid)
1841                ib_hfi1_sys_image_guid = ibdev->node_guid;
1842        ibdev->phys_port_cnt = dd->num_pports;
1843        ibdev->dev.parent = &dd->pcidev->dev;
1844
1845        ib_set_device_ops(ibdev, &hfi1_dev_ops);
1846
1847        strlcpy(ibdev->node_desc, init_utsname()->nodename,
1848                sizeof(ibdev->node_desc));
1849
1850        /*
1851         * Fill in rvt info object.
1852         */
1853        dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
1854        dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
1855        dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
1856        dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be;
1857        dd->verbs_dev.rdi.driver_f.query_port_state = query_port;
1858        dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port;
1859        dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg;
1860        /*
1861         * Fill in rvt info device attributes.
1862         */
1863        hfi1_fill_device_attr(dd);
1864
1865        /* queue pair */
1866        dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size;
1867        dd->verbs_dev.rdi.dparms.qpn_start = 0;
1868        dd->verbs_dev.rdi.dparms.qpn_inc = 1;
1869        dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
1870        dd->verbs_dev.rdi.dparms.qpn_res_start = RVT_KDETH_QP_BASE;
1871        dd->verbs_dev.rdi.dparms.qpn_res_end = RVT_AIP_QP_MAX;
1872        dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
1873        dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
1874        dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
1875        dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK;
1876        dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA |
1877                                                RDMA_CORE_CAP_OPA_AH;
1878        dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
1879
1880        dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
1881        dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
1882        dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
1883        dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
1884        dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
1885        dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt;
1886        dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send;
1887        dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send;
1888        dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr;
1889        dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp;
1890        dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters;
1891        dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue;
1892        dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp;
1894        dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp;
1895        dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu;
1896        dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
1897        dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
1898        dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
1899        dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
1900        dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
1901                                                hfi1_comp_vect_mappings_lookup;
1902
1903        /* completion queue */
1904        dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus;
1905        dd->verbs_dev.rdi.dparms.node = dd->node;
1906
1907        /* misc settings */
1908        dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */
1909        dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
1910        dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
1911        dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
1912        dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
1913        dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
1914        dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
1915        dd->verbs_dev.rdi.dparms.reserved_operations = 1;
1916        dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
1917
1918        /* post send table */
1919        dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
1920
1921        /* opcode translation table */
1922        dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
1923
1924        ppd = dd->pport;
1925        for (i = 0; i < dd->num_pports; i++, ppd++)
1926                rvt_init_port(&dd->verbs_dev.rdi,
1927                              &ppd->ibport_data.rvp,
1928                              i,
1929                              ppd->pkeys);
1930
1931        ret = rvt_register_device(&dd->verbs_dev.rdi);
1932        if (ret)
1933                goto err_verbs_txreq;
1934
1935        ret = hfi1_verbs_register_sysfs(dd);
1936        if (ret)
1937                goto err_class;
1938
1939        return ret;
1940
1941err_class:
1942        rvt_unregister_device(&dd->verbs_dev.rdi);
1943err_verbs_txreq:
1944        verbs_txreq_exit(dev);
1945        dd_dev_err(dd, "cannot register verbs: %d!\n", -ret);
1946        return ret;
1947}
1948
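    /**
     * hfi1_unregister_ib_device - unregister our device from the infiniband core
     * @dd: the device data structure
     */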
1949void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
1950{
1951        struct hfi1_ibdev *dev = &dd->verbs_dev;
1952
1953        hfi1_verbs_unregister_sysfs(dd);
1954
1955        rvt_unregister_device(&dd->verbs_dev.rdi);
1956
1957        if (!list_empty(&dev->txwait))
1958                dd_dev_err(dd, "txwait list not empty!\n");
1959        if (!list_empty(&dev->memwait))
1960                dd_dev_err(dd, "memwait list not empty!\n");
1961
1962        del_timer_sync(&dev->mem_timer);
1963        verbs_txreq_exit(dev);
1964
1965        mutex_lock(&cntr_names_lock);
1966        kfree(dev_cntr_names);
1967        kfree(port_cntr_names);
1968        dev_cntr_names = NULL;
1969        port_cntr_names = NULL;
1970        cntr_names_initialized = 0;
1971        mutex_unlock(&cntr_names_lock);
1972}
1973
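    /*
     * Handle a received congestion notification: derive the remote LID/QPN
     * and service type from the QP, then pass the event to process_becn().
     */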
1974void hfi1_cnp_rcv(struct hfi1_packet *packet)
1975{
1976        struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
1977        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1978        struct ib_header *hdr = packet->hdr;
1979        struct rvt_qp *qp = packet->qp;
1980        u32 lqpn, rqpn = 0;
1981        u16 rlid = 0;
1982        u8 sl, sc5, svc_type;
1983
1984        switch (packet->qp->ibqp.qp_type) {
1985        case IB_QPT_UC:
1986                rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1987                rqpn = qp->remote_qpn;
1988                svc_type = IB_CC_SVCTYPE_UC;
1989                break;
1990        case IB_QPT_RC:
1991                rlid = rdma_ah_get_dlid(&qp->remote_ah_attr);
1992                rqpn = qp->remote_qpn;
1993                svc_type = IB_CC_SVCTYPE_RC;
1994                break;
1995        case IB_QPT_SMI:
1996        case IB_QPT_GSI:
1997        case IB_QPT_UD:
1998                svc_type = IB_CC_SVCTYPE_UD;
1999                break;
2000        default:
2001                ibp->rvp.n_pkt_drops++;
2002                return;
2003        }
2004
2005        sc5 = hfi1_9B_get_sc5(hdr, packet->rhf);
2006        sl = ibp->sc_to_sl[sc5];
2007        lqpn = qp->ibqp.qp_num;
2008
2009        process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
2010}
2011