linux/drivers/infiniband/hw/hfi1/rc.c
   1/*
   2 * Copyright(c) 2015, 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <linux/io.h>
  49#include <rdma/rdma_vt.h>
  50#include <rdma/rdmavt_qp.h>
  51
  52#include "hfi.h"
  53#include "qp.h"
  54#include "verbs_txreq.h"
  55#include "trace.h"
  56
  57/* cut down ridiculously long IB macro names */
  58#define OP(x) RC_OP(x)
  59
  60/**
  61 * hfi1_add_retry_timer - add/start a retry timer
   62 * @qp: the QP
  63 *
  64 * add a retry timer on the QP
  65 */
  66static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
  67{
  68        struct ib_qp *ibqp = &qp->ibqp;
  69        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
  70
  71        lockdep_assert_held(&qp->s_lock);
  72        qp->s_flags |= RVT_S_TIMER;
  73        /* 4.096 usec. * (1 << qp->timeout) */
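        /*
         * Worked example: with qp->timeout == 14 this is
         * 4.096 usec * 2^14, i.e. roughly 67 msec; rdmavt pre-computes
         * that product into qp->timeout_jiffies when the QP's timeout
         * attribute is modified.
         */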
  74        qp->s_timer.expires = jiffies + qp->timeout_jiffies +
  75                              rdi->busy_jiffies;
  76        add_timer(&qp->s_timer);
  77}
  78
  79/**
  80 * hfi1_add_rnr_timer - add/start an rnr timer
   81 * @qp: the QP
   82 * @to: timeout in usecs
  83 *
  84 * add an rnr timer on the QP
  85 */
  86void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
  87{
  88        struct hfi1_qp_priv *priv = qp->priv;
  89
  90        lockdep_assert_held(&qp->s_lock);
  91        qp->s_flags |= RVT_S_WAIT_RNR;
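        /*
         * "to" is the RNR backoff delay in microseconds, typically
         * decoded by the caller from the AETH timer field of a
         * received RNR NAK.
         */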
  92        priv->s_rnr_timer.expires = jiffies + usecs_to_jiffies(to);
  93        add_timer(&priv->s_rnr_timer);
  94}
  95
  96/**
  97 * hfi1_mod_retry_timer - mod a retry timer
   98 * @qp: the QP
  99 *
 100 * Modify a potentially already running retry
 101 * timer
 102 */
 103static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
 104{
 105        struct ib_qp *ibqp = &qp->ibqp;
 106        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
 107
 108        lockdep_assert_held(&qp->s_lock);
 109        qp->s_flags |= RVT_S_TIMER;
 110        /* 4.096 usec. * (1 << qp->timeout) */
 111        mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
 112                  rdi->busy_jiffies);
 113}
 114
 115/**
 116 * hfi1_stop_retry_timer - stop a retry timer
  117 * @qp: the QP
  118 *
  119 * stop a retry timer and return whether the timer
  120 * had been pending.
 121 */
 122static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
 123{
 124        int rval = 0;
 125
 126        lockdep_assert_held(&qp->s_lock);
 127        /* Remove QP from retry */
 128        if (qp->s_flags & RVT_S_TIMER) {
 129                qp->s_flags &= ~RVT_S_TIMER;
 130                rval = del_timer(&qp->s_timer);
 131        }
 132        return rval;
 133}
 134
 135/**
 136 * hfi1_stop_rc_timers - stop all timers
  137 * @qp: the QP
 138 *
 139 * stop any pending timers
 140 */
 141void hfi1_stop_rc_timers(struct rvt_qp *qp)
 142{
 143        struct hfi1_qp_priv *priv = qp->priv;
 144
 145        lockdep_assert_held(&qp->s_lock);
 146        /* Remove QP from all timers */
 147        if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
 148                qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
 149                del_timer(&qp->s_timer);
 150                del_timer(&priv->s_rnr_timer);
 151        }
 152}
 153
 154/**
 155 * hfi1_stop_rnr_timer - stop an rnr timer
  156 * @qp: the QP
  157 *
  158 * stop an rnr timer and return whether the timer
  159 * had been pending.
 160 */
 161static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
 162{
 163        int rval = 0;
 164        struct hfi1_qp_priv *priv = qp->priv;
 165
 166        lockdep_assert_held(&qp->s_lock);
 167        /* Remove QP from rnr timer */
 168        if (qp->s_flags & RVT_S_WAIT_RNR) {
 169                qp->s_flags &= ~RVT_S_WAIT_RNR;
 170                rval = del_timer(&priv->s_rnr_timer);
 171        }
 172        return rval;
 173}
 174
 175/**
 176 * hfi1_del_timers_sync - wait for any timeout routines to exit
  177 * @qp: the QP
 178 */
 179void hfi1_del_timers_sync(struct rvt_qp *qp)
 180{
 181        struct hfi1_qp_priv *priv = qp->priv;
 182
 183        del_timer_sync(&qp->s_timer);
 184        del_timer_sync(&priv->s_rnr_timer);
 185}
 186
 187static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
 188                       u32 psn, u32 pmtu)
 189{
 190        u32 len;
 191
 192        len = delta_psn(psn, wqe->psn) * pmtu;
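        /*
         * Example: restarting at wqe->psn + 3 with a 4096-byte MTU means
         * three full payloads (12288 bytes) were already sent, so skip
         * that many bytes of the SGE state before resending.
         */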
 193        ss->sge = wqe->sg_list[0];
 194        ss->sg_list = wqe->sg_list + 1;
 195        ss->num_sge = wqe->wr.num_sge;
 196        ss->total_len = wqe->length;
 197        hfi1_skip_sge(ss, len, 0);
 198        return wqe->length - len;
 199}
 200
 201/**
 202 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 203 * @dev: the device for this QP
 204 * @qp: a pointer to the QP
 205 * @ohdr: a pointer to the IB header being constructed
 206 * @ps: the xmit packet state
 207 *
 208 * Return 1 if constructed; otherwise, return 0.
  209 * Note that we are on the responder's side of the QP context.
 210 * Note the QP s_lock must be held.
 211 */
 212static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
 213                       struct ib_other_headers *ohdr,
 214                       struct hfi1_pkt_state *ps)
 215{
 216        struct rvt_ack_entry *e;
 217        u32 hwords;
 218        u32 len;
 219        u32 bth0;
 220        u32 bth2;
 221        int middle = 0;
 222        u32 pmtu = qp->pmtu;
 223        struct hfi1_qp_priv *priv = qp->priv;
 224
 225        lockdep_assert_held(&qp->s_lock);
 226        /* Don't send an ACK if we aren't supposed to. */
 227        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
 228                goto bail;
 229
 230        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
 231        hwords = 5;
 232
 233        switch (qp->s_ack_state) {
 234        case OP(RDMA_READ_RESPONSE_LAST):
 235        case OP(RDMA_READ_RESPONSE_ONLY):
 236                e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 237                if (e->rdma_sge.mr) {
 238                        rvt_put_mr(e->rdma_sge.mr);
 239                        e->rdma_sge.mr = NULL;
 240                }
 241                /* FALLTHROUGH */
 242        case OP(ATOMIC_ACKNOWLEDGE):
 243                /*
 244                 * We can increment the tail pointer now that the last
 245                 * response has been sent instead of only being
 246                 * constructed.
 247                 */
 248                if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
 249                        qp->s_tail_ack_queue = 0;
 250                /* FALLTHROUGH */
 251        case OP(SEND_ONLY):
 252        case OP(ACKNOWLEDGE):
 253                /* Check for no next entry in the queue. */
 254                if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
 255                        if (qp->s_flags & RVT_S_ACK_PENDING)
 256                                goto normal;
 257                        goto bail;
 258                }
 259
 260                e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 261                if (e->opcode == OP(RDMA_READ_REQUEST)) {
 262                        /*
 263                         * If a RDMA read response is being resent and
 264                         * we haven't seen the duplicate request yet,
 265                         * then stop sending the remaining responses the
 266                         * responder has seen until the requester re-sends it.
 267                         */
 268                        len = e->rdma_sge.sge_length;
 269                        if (len && !e->rdma_sge.mr) {
 270                                qp->s_tail_ack_queue = qp->r_head_ack_queue;
 271                                goto bail;
 272                        }
 273                        /* Copy SGE state in case we need to resend */
 274                        ps->s_txreq->mr = e->rdma_sge.mr;
 275                        if (ps->s_txreq->mr)
 276                                rvt_get_mr(ps->s_txreq->mr);
 277                        qp->s_ack_rdma_sge.sge = e->rdma_sge;
 278                        qp->s_ack_rdma_sge.num_sge = 1;
 279                        qp->s_cur_sge = &qp->s_ack_rdma_sge;
 280                        if (len > pmtu) {
 281                                len = pmtu;
 282                                qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
 283                        } else {
 284                                qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
 285                                e->sent = 1;
 286                        }
 287                        ohdr->u.aeth = hfi1_compute_aeth(qp);
 288                        hwords++;
 289                        qp->s_ack_rdma_psn = e->psn;
 290                        bth2 = mask_psn(qp->s_ack_rdma_psn++);
 291                } else {
 292                        /* COMPARE_SWAP or FETCH_ADD */
 293                        qp->s_cur_sge = NULL;
 294                        len = 0;
 295                        qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
 296                        ohdr->u.at.aeth = hfi1_compute_aeth(qp);
 297                        ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
 298                        hwords += sizeof(ohdr->u.at) / sizeof(u32);
 299                        bth2 = mask_psn(e->psn);
 300                        e->sent = 1;
 301                }
 302                bth0 = qp->s_ack_state << 24;
 303                break;
 304
 305        case OP(RDMA_READ_RESPONSE_FIRST):
 306                qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
 307                /* FALLTHROUGH */
 308        case OP(RDMA_READ_RESPONSE_MIDDLE):
 309                qp->s_cur_sge = &qp->s_ack_rdma_sge;
 310                ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
 311                if (ps->s_txreq->mr)
 312                        rvt_get_mr(ps->s_txreq->mr);
 313                len = qp->s_ack_rdma_sge.sge.sge_length;
 314                if (len > pmtu) {
 315                        len = pmtu;
 316                        middle = HFI1_CAP_IS_KSET(SDMA_AHG);
 317                } else {
 318                        ohdr->u.aeth = hfi1_compute_aeth(qp);
 319                        hwords++;
 320                        qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
 321                        e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 322                        e->sent = 1;
 323                }
 324                bth0 = qp->s_ack_state << 24;
 325                bth2 = mask_psn(qp->s_ack_rdma_psn++);
 326                break;
 327
 328        default:
 329normal:
 330                /*
 331                 * Send a regular ACK.
 332                 * Set the s_ack_state so we wait until after sending
 333                 * the ACK before setting s_ack_state to ACKNOWLEDGE
 334                 * (see above).
 335                 */
 336                qp->s_ack_state = OP(SEND_ONLY);
 337                qp->s_flags &= ~RVT_S_ACK_PENDING;
 338                qp->s_cur_sge = NULL;
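                /*
                 * AETH layout: the syndrome (here the NAK code) occupies
                 * bits 31..24 and the 24-bit MSN occupies bits 23..0,
                 * hence the HFI1_AETH_CREDIT_SHIFT applied to s_nak_state
                 * below.
                 */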
 339                if (qp->s_nak_state)
 340                        ohdr->u.aeth =
 341                                cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
 342                                            (qp->s_nak_state <<
 343                                             HFI1_AETH_CREDIT_SHIFT));
 344                else
 345                        ohdr->u.aeth = hfi1_compute_aeth(qp);
 346                hwords++;
 347                len = 0;
 348                bth0 = OP(ACKNOWLEDGE) << 24;
 349                bth2 = mask_psn(qp->s_ack_psn);
 350        }
 351        qp->s_rdma_ack_cnt++;
 352        qp->s_hdrwords = hwords;
 353        ps->s_txreq->sde = priv->s_sde;
 354        qp->s_cur_size = len;
 355        hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
 356        /* pbc */
 357        ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
 358        return 1;
 359
 360bail:
 361        qp->s_ack_state = OP(ACKNOWLEDGE);
 362        /*
 363         * Ensure s_rdma_ack_cnt changes are committed prior to resetting
 364         * RVT_S_RESP_PENDING
 365         */
 366        smp_wmb();
 367        qp->s_flags &= ~(RVT_S_RESP_PENDING
 368                                | RVT_S_ACK_PENDING
 369                                | RVT_S_AHG_VALID);
 370        return 0;
 371}
 372
 373/**
 374 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 375 * @qp: a pointer to the QP
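 * @ps: the xmit packet state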
 376 *
 377 * Assumes s_lock is held.
 378 *
 379 * Return 1 if constructed; otherwise, return 0.
 380 */
 381int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 382{
 383        struct hfi1_qp_priv *priv = qp->priv;
 384        struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
 385        struct ib_other_headers *ohdr;
 386        struct rvt_sge_state *ss;
 387        struct rvt_swqe *wqe;
 388        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
 389        u32 hwords = 5;
 390        u32 len;
 391        u32 bth0 = 0;
 392        u32 bth2;
 393        u32 pmtu = qp->pmtu;
 394        char newreq;
 395        int middle = 0;
 396        int delta;
 397
 398        lockdep_assert_held(&qp->s_lock);
 399        ps->s_txreq = get_txreq(ps->dev, qp);
 400        if (IS_ERR(ps->s_txreq))
 401                goto bail_no_tx;
 402
 403        ohdr = &ps->s_txreq->phdr.hdr.u.oth;
 404        if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
 405                ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;
 406
  407        /* Sending responses takes priority over sending requests. */
 408        if ((qp->s_flags & RVT_S_RESP_PENDING) &&
 409            make_rc_ack(dev, qp, ohdr, ps))
 410                return 1;
 411
 412        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
 413                if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
 414                        goto bail;
 415                /* We are in the error state, flush the work request. */
 416                smp_read_barrier_depends(); /* see post_one_send() */
 417                if (qp->s_last == ACCESS_ONCE(qp->s_head))
 418                        goto bail;
 419                /* If DMAs are in progress, we can't flush immediately. */
 420                if (iowait_sdma_pending(&priv->s_iowait)) {
 421                        qp->s_flags |= RVT_S_WAIT_DMA;
 422                        goto bail;
 423                }
 424                clear_ahg(qp);
 425                wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 426                hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
 427                        IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
 428                /* will get called again */
 429                goto done_free_tx;
 430        }
 431
 432        if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
 433                goto bail;
 434
 435        if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
 436                if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
 437                        qp->s_flags |= RVT_S_WAIT_PSN;
 438                        goto bail;
 439                }
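                /*
                 * Nothing is left in flight: re-anchor the range of PSNs
                 * still being sent at s_psn, with hpsn one behind psn so
                 * the range stays empty.
                 */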
 440                qp->s_sending_psn = qp->s_psn;
 441                qp->s_sending_hpsn = qp->s_psn - 1;
 442        }
 443
 444        /* Send a request. */
 445        wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
 446        switch (qp->s_state) {
 447        default:
 448                if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
 449                        goto bail;
 450                /*
 451                 * Resend an old request or start a new one.
 452                 *
 453                 * We keep track of the current SWQE so that
 454                 * we don't reset the "furthest progress" state
 455                 * if we need to back up.
 456                 */
 457                newreq = 0;
 458                if (qp->s_cur == qp->s_tail) {
 459                        /* Check if send work queue is empty. */
 460                        if (qp->s_tail == qp->s_head) {
 461                                clear_ahg(qp);
 462                                goto bail;
 463                        }
 464                        /*
 465                         * If a fence is requested, wait for previous
 466                         * RDMA read and atomic operations to finish.
 467                         */
 468                        if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
 469                            qp->s_num_rd_atomic) {
 470                                qp->s_flags |= RVT_S_WAIT_FENCE;
 471                                goto bail;
 472                        }
 473                        /*
 474                         * Local operations are processed immediately
 475                         * after all prior requests have completed
 476                         */
 477                        if (wqe->wr.opcode == IB_WR_REG_MR ||
 478                            wqe->wr.opcode == IB_WR_LOCAL_INV) {
 479                                int local_ops = 0;
 480                                int err = 0;
 481
 482                                if (qp->s_last != qp->s_cur)
 483                                        goto bail;
 484                                if (++qp->s_cur == qp->s_size)
 485                                        qp->s_cur = 0;
 486                                if (++qp->s_tail == qp->s_size)
 487                                        qp->s_tail = 0;
 488                                if (!(wqe->wr.send_flags &
 489                                      RVT_SEND_COMPLETION_ONLY)) {
 490                                        err = rvt_invalidate_rkey(
 491                                                qp,
 492                                                wqe->wr.ex.invalidate_rkey);
 493                                        local_ops = 1;
 494                                }
 495                                hfi1_send_complete(qp, wqe,
 496                                                   err ? IB_WC_LOC_PROT_ERR
 497                                                       : IB_WC_SUCCESS);
 498                                if (local_ops)
 499                                        atomic_dec(&qp->local_ops_pending);
 500                                qp->s_hdrwords = 0;
 501                                goto done_free_tx;
 502                        }
 503
 504                        newreq = 1;
 505                        qp->s_psn = wqe->psn;
 506                }
 507                /*
 508                 * Note that we have to be careful not to modify the
 509                 * original work request since we may need to resend
 510                 * it.
 511                 */
 512                len = wqe->length;
 513                ss = &qp->s_sge;
 514                bth2 = mask_psn(qp->s_psn);
 515                switch (wqe->wr.opcode) {
 516                case IB_WR_SEND:
 517                case IB_WR_SEND_WITH_IMM:
 518                case IB_WR_SEND_WITH_INV:
 519                        /* If no credit, return. */
 520                        if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
 521                            cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
 522                                qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
 523                                goto bail;
 524                        }
 525                        if (len > pmtu) {
 526                                qp->s_state = OP(SEND_FIRST);
 527                                len = pmtu;
 528                                break;
 529                        }
 530                        if (wqe->wr.opcode == IB_WR_SEND) {
 531                                qp->s_state = OP(SEND_ONLY);
 532                        } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
 533                                qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
 534                                /* Immediate data comes after the BTH */
 535                                ohdr->u.imm_data = wqe->wr.ex.imm_data;
 536                                hwords += 1;
 537                        } else {
 538                                qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
 539                                /* Invalidate rkey comes after the BTH */
 540                                ohdr->u.ieth = cpu_to_be32(
 541                                                wqe->wr.ex.invalidate_rkey);
 542                                hwords += 1;
 543                        }
 544                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 545                                bth0 |= IB_BTH_SOLICITED;
 546                        bth2 |= IB_BTH_REQ_ACK;
 547                        if (++qp->s_cur == qp->s_size)
 548                                qp->s_cur = 0;
 549                        break;
 550
 551                case IB_WR_RDMA_WRITE:
 552                        if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
 553                                qp->s_lsn++;
 554                        /* FALLTHROUGH */
 555                case IB_WR_RDMA_WRITE_WITH_IMM:
 556                        /* If no credit, return. */
 557                        if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
 558                            cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
 559                                qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
 560                                goto bail;
 561                        }
 562                        put_ib_reth_vaddr(
 563                                wqe->rdma_wr.remote_addr,
 564                                &ohdr->u.rc.reth);
 565                        ohdr->u.rc.reth.rkey =
 566                                cpu_to_be32(wqe->rdma_wr.rkey);
 567                        ohdr->u.rc.reth.length = cpu_to_be32(len);
 568                        hwords += sizeof(struct ib_reth) / sizeof(u32);
 569                        if (len > pmtu) {
 570                                qp->s_state = OP(RDMA_WRITE_FIRST);
 571                                len = pmtu;
 572                                break;
 573                        }
 574                        if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
 575                                qp->s_state = OP(RDMA_WRITE_ONLY);
 576                        } else {
 577                                qp->s_state =
 578                                        OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
 579                                /* Immediate data comes after RETH */
 580                                ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
 581                                hwords += 1;
 582                                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 583                                        bth0 |= IB_BTH_SOLICITED;
 584                        }
 585                        bth2 |= IB_BTH_REQ_ACK;
 586                        if (++qp->s_cur == qp->s_size)
 587                                qp->s_cur = 0;
 588                        break;
 589
 590                case IB_WR_RDMA_READ:
 591                        /*
 592                         * Don't allow more operations to be started
 593                         * than the QP limits allow.
 594                         */
 595                        if (newreq) {
 596                                if (qp->s_num_rd_atomic >=
 597                                    qp->s_max_rd_atomic) {
 598                                        qp->s_flags |= RVT_S_WAIT_RDMAR;
 599                                        goto bail;
 600                                }
 601                                qp->s_num_rd_atomic++;
 602                                if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
 603                                        qp->s_lsn++;
 604                        }
 605                        put_ib_reth_vaddr(
 606                                wqe->rdma_wr.remote_addr,
 607                                &ohdr->u.rc.reth);
 608                        ohdr->u.rc.reth.rkey =
 609                                cpu_to_be32(wqe->rdma_wr.rkey);
 610                        ohdr->u.rc.reth.length = cpu_to_be32(len);
 611                        qp->s_state = OP(RDMA_READ_REQUEST);
 612                        hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
 613                        ss = NULL;
 614                        len = 0;
 615                        bth2 |= IB_BTH_REQ_ACK;
 616                        if (++qp->s_cur == qp->s_size)
 617                                qp->s_cur = 0;
 618                        break;
 619
 620                case IB_WR_ATOMIC_CMP_AND_SWP:
 621                case IB_WR_ATOMIC_FETCH_AND_ADD:
 622                        /*
 623                         * Don't allow more operations to be started
 624                         * than the QP limits allow.
 625                         */
 626                        if (newreq) {
 627                                if (qp->s_num_rd_atomic >=
 628                                    qp->s_max_rd_atomic) {
 629                                        qp->s_flags |= RVT_S_WAIT_RDMAR;
 630                                        goto bail;
 631                                }
 632                                qp->s_num_rd_atomic++;
 633                                if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
 634                                        qp->s_lsn++;
 635                        }
 636                        if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
 637                                qp->s_state = OP(COMPARE_SWAP);
 638                                put_ib_ateth_swap(wqe->atomic_wr.swap,
 639                                                  &ohdr->u.atomic_eth);
 640                                put_ib_ateth_compare(wqe->atomic_wr.compare_add,
 641                                                     &ohdr->u.atomic_eth);
 642                        } else {
 643                                qp->s_state = OP(FETCH_ADD);
 644                                put_ib_ateth_swap(wqe->atomic_wr.compare_add,
 645                                                  &ohdr->u.atomic_eth);
 646                                put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
 647                        }
 648                        put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
 649                                           &ohdr->u.atomic_eth);
 650                        ohdr->u.atomic_eth.rkey = cpu_to_be32(
 651                                wqe->atomic_wr.rkey);
 652                        hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
 653                        ss = NULL;
 654                        len = 0;
 655                        bth2 |= IB_BTH_REQ_ACK;
 656                        if (++qp->s_cur == qp->s_size)
 657                                qp->s_cur = 0;
 658                        break;
 659
 660                default:
 661                        goto bail;
 662                }
 663                qp->s_sge.sge = wqe->sg_list[0];
 664                qp->s_sge.sg_list = wqe->sg_list + 1;
 665                qp->s_sge.num_sge = wqe->wr.num_sge;
 666                qp->s_sge.total_len = wqe->length;
 667                qp->s_len = wqe->length;
 668                if (newreq) {
 669                        qp->s_tail++;
 670                        if (qp->s_tail >= qp->s_size)
 671                                qp->s_tail = 0;
 672                }
 673                if (wqe->wr.opcode == IB_WR_RDMA_READ)
 674                        qp->s_psn = wqe->lpsn + 1;
 675                else
 676                        qp->s_psn++;
 677                break;
 678
 679        case OP(RDMA_READ_RESPONSE_FIRST):
 680                /*
 681                 * qp->s_state is normally set to the opcode of the
 682                 * last packet constructed for new requests and therefore
 683                 * is never set to RDMA read response.
 684                 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
 685                 * thread to indicate a SEND needs to be restarted from an
 686                 * earlier PSN without interfering with the sending thread.
 687                 * See restart_rc().
 688                 */
 689                qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
 690                /* FALLTHROUGH */
 691        case OP(SEND_FIRST):
 692                qp->s_state = OP(SEND_MIDDLE);
 693                /* FALLTHROUGH */
 694        case OP(SEND_MIDDLE):
 695                bth2 = mask_psn(qp->s_psn++);
 696                ss = &qp->s_sge;
 697                len = qp->s_len;
 698                if (len > pmtu) {
 699                        len = pmtu;
 700                        middle = HFI1_CAP_IS_KSET(SDMA_AHG);
 701                        break;
 702                }
 703                if (wqe->wr.opcode == IB_WR_SEND) {
 704                        qp->s_state = OP(SEND_LAST);
 705                } else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
 706                        qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
 707                        /* Immediate data comes after the BTH */
 708                        ohdr->u.imm_data = wqe->wr.ex.imm_data;
 709                        hwords += 1;
 710                } else {
 711                        qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
 712                        /* invalidate data comes after the BTH */
 713                        ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
 714                        hwords += 1;
 715                }
 716                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 717                        bth0 |= IB_BTH_SOLICITED;
 718                bth2 |= IB_BTH_REQ_ACK;
 719                qp->s_cur++;
 720                if (qp->s_cur >= qp->s_size)
 721                        qp->s_cur = 0;
 722                break;
 723
 724        case OP(RDMA_READ_RESPONSE_LAST):
 725                /*
 726                 * qp->s_state is normally set to the opcode of the
 727                 * last packet constructed for new requests and therefore
 728                 * is never set to RDMA read response.
 729                 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
 730                 * thread to indicate a RDMA write needs to be restarted from
 731                 * an earlier PSN without interfering with the sending thread.
 732                 * See restart_rc().
 733                 */
 734                qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
 735                /* FALLTHROUGH */
 736        case OP(RDMA_WRITE_FIRST):
 737                qp->s_state = OP(RDMA_WRITE_MIDDLE);
 738                /* FALLTHROUGH */
 739        case OP(RDMA_WRITE_MIDDLE):
 740                bth2 = mask_psn(qp->s_psn++);
 741                ss = &qp->s_sge;
 742                len = qp->s_len;
 743                if (len > pmtu) {
 744                        len = pmtu;
 745                        middle = HFI1_CAP_IS_KSET(SDMA_AHG);
 746                        break;
 747                }
 748                if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
 749                        qp->s_state = OP(RDMA_WRITE_LAST);
 750                } else {
 751                        qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
 752                        /* Immediate data comes after the BTH */
 753                        ohdr->u.imm_data = wqe->wr.ex.imm_data;
 754                        hwords += 1;
 755                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 756                                bth0 |= IB_BTH_SOLICITED;
 757                }
 758                bth2 |= IB_BTH_REQ_ACK;
 759                qp->s_cur++;
 760                if (qp->s_cur >= qp->s_size)
 761                        qp->s_cur = 0;
 762                break;
 763
 764        case OP(RDMA_READ_RESPONSE_MIDDLE):
 765                /*
 766                 * qp->s_state is normally set to the opcode of the
 767                 * last packet constructed for new requests and therefore
 768                 * is never set to RDMA read response.
 769                 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
 770                 * thread to indicate a RDMA read needs to be restarted from
 771                 * an earlier PSN without interfering with the sending thread.
 772                 * See restart_rc().
 773                 */
 774                len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
 775                put_ib_reth_vaddr(
 776                        wqe->rdma_wr.remote_addr + len,
 777                        &ohdr->u.rc.reth);
 778                ohdr->u.rc.reth.rkey =
 779                        cpu_to_be32(wqe->rdma_wr.rkey);
 780                ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
 781                qp->s_state = OP(RDMA_READ_REQUEST);
 782                hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
 783                bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
 784                qp->s_psn = wqe->lpsn + 1;
 785                ss = NULL;
 786                len = 0;
 787                qp->s_cur++;
 788                if (qp->s_cur == qp->s_size)
 789                        qp->s_cur = 0;
 790                break;
 791        }
 792        qp->s_sending_hpsn = bth2;
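        /*
         * Within a multi-packet request, ask for an ACK every
         * HFI1_PSN_CREDIT packets so acknowledgements keep flowing on
         * long transfers.
         */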
 793        delta = delta_psn(bth2, wqe->psn);
 794        if (delta && delta % HFI1_PSN_CREDIT == 0)
 795                bth2 |= IB_BTH_REQ_ACK;
 796        if (qp->s_flags & RVT_S_SEND_ONE) {
 797                qp->s_flags &= ~RVT_S_SEND_ONE;
 798                qp->s_flags |= RVT_S_WAIT_ACK;
 799                bth2 |= IB_BTH_REQ_ACK;
 800        }
 801        qp->s_len -= len;
 802        qp->s_hdrwords = hwords;
 803        ps->s_txreq->sde = priv->s_sde;
 804        qp->s_cur_sge = ss;
 805        qp->s_cur_size = len;
 806        hfi1_make_ruc_header(
 807                qp,
 808                ohdr,
 809                bth0 | (qp->s_state << 24),
 810                bth2,
 811                middle,
 812                ps);
 813        /* pbc */
 814        ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
 815        return 1;
 816
 817done_free_tx:
 818        hfi1_put_txreq(ps->s_txreq);
 819        ps->s_txreq = NULL;
 820        return 1;
 821
 822bail:
 823        hfi1_put_txreq(ps->s_txreq);
 824
 825bail_no_tx:
 826        ps->s_txreq = NULL;
 827        qp->s_flags &= ~RVT_S_BUSY;
 828        qp->s_hdrwords = 0;
 829        return 0;
 830}
 831
 832/**
 833 * hfi1_send_rc_ack - Construct an ACK packet and send it
 834 * @qp: a pointer to the QP
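 * @rcd: the receive context
 * @is_fecn: non-zero if the packet being ACKed carried the FECN bit;
 *           the ACK will then carry the BECN bit in response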
 835 *
 836 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
 837 * Note that RDMA reads and atomics are handled in the
 838 * send side QP state and send engine.
 839 */
 840void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
 841                      int is_fecn)
 842{
 843        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
 844        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
 845        u64 pbc, pbc_flags = 0;
 846        u16 lrh0;
 847        u16 sc5;
 848        u32 bth0;
 849        u32 hwords;
 850        u32 vl, plen;
 851        struct send_context *sc;
 852        struct pio_buf *pbuf;
 853        struct ib_header hdr;
 854        struct ib_other_headers *ohdr;
 855        unsigned long flags;
 856
 857        /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
 858        if (qp->s_flags & RVT_S_RESP_PENDING)
 859                goto queue_ack;
 860
 861        /* Ensure s_rdma_ack_cnt changes are committed */
 862        smp_read_barrier_depends();
 863        if (qp->s_rdma_ack_cnt)
 864                goto queue_ack;
 865
 866        /* Construct the header */
 867        /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
 868        hwords = 6;
 869        if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
 870                hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
 871                                       &qp->remote_ah_attr.grh, hwords, 0);
 872                ohdr = &hdr.u.l.oth;
 873                lrh0 = HFI1_LRH_GRH;
 874        } else {
 875                ohdr = &hdr.u.oth;
 876                lrh0 = HFI1_LRH_BTH;
 877        }
  878        /* read pkey_index w/o lock (it's atomic) */
 879        bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
 880        if (qp->s_mig_state == IB_MIG_MIGRATED)
 881                bth0 |= IB_BTH_MIG_REQ;
 882        if (qp->r_nak_state)
 883                ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
 884                                            (qp->r_nak_state <<
 885                                             HFI1_AETH_CREDIT_SHIFT));
 886        else
 887                ohdr->u.aeth = hfi1_compute_aeth(qp);
 888        sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
 889        /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
 890        pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
 891        lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
 892        hdr.lrh[0] = cpu_to_be16(lrh0);
 893        hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
 894        hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
 895        hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
 896        ohdr->bth[0] = cpu_to_be32(bth0);
 897        ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
 898        ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
 899        ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
 900
 901        /* Don't try to send ACKs if the link isn't ACTIVE */
 902        if (driver_lstate(ppd) != IB_PORT_ACTIVE)
 903                return;
 904
 905        sc = rcd->sc;
 906        plen = 2 /* PBC */ + hwords;
 907        vl = sc_to_vlt(ppd->dd, sc5);
 908        pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 909
 910        pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
 911        if (!pbuf) {
 912                /*
 913                 * We have no room to send at the moment.  Pass
 914                 * responsibility for sending the ACK to the send engine
 915                 * so that when enough buffer space becomes available,
 916                 * the ACK is sent ahead of other outgoing packets.
 917                 */
 918                goto queue_ack;
 919        }
 920
 921        trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);
 922
 923        /* write the pbc and data */
 924        ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);
 925
 926        return;
 927
 928queue_ack:
 929        spin_lock_irqsave(&qp->s_lock, flags);
 930        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
 931                goto unlock;
 932        this_cpu_inc(*ibp->rvp.rc_qacks);
 933        qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
 934        qp->s_nak_state = qp->r_nak_state;
 935        qp->s_ack_psn = qp->r_ack_psn;
 936        if (is_fecn)
 937                qp->s_flags |= RVT_S_ECN;
 938
 939        /* Schedule the send engine. */
 940        hfi1_schedule_send(qp);
 941unlock:
 942        spin_unlock_irqrestore(&qp->s_lock, flags);
 943}
 944
 945/**
 946 * reset_psn - reset the QP state to send starting from PSN
 947 * @qp: the QP
 948 * @psn: the packet sequence number to restart at
 949 *
 950 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
 951 * for the given QP.
 952 * Called at interrupt level with the QP s_lock held.
 953 */
 954static void reset_psn(struct rvt_qp *qp, u32 psn)
 955{
 956        u32 n = qp->s_acked;
 957        struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
 958        u32 opcode;
 959
 960        lockdep_assert_held(&qp->s_lock);
 961        qp->s_cur = n;
 962
 963        /*
 964         * If we are starting the request from the beginning,
 965         * let the normal send code handle initialization.
 966         */
 967        if (cmp_psn(psn, wqe->psn) <= 0) {
 968                qp->s_state = OP(SEND_LAST);
 969                goto done;
 970        }
 971
 972        /* Find the work request opcode corresponding to the given PSN. */
 973        opcode = wqe->wr.opcode;
 974        for (;;) {
 975                int diff;
 976
 977                if (++n == qp->s_size)
 978                        n = 0;
 979                if (n == qp->s_tail)
 980                        break;
 981                wqe = rvt_get_swqe_ptr(qp, n);
 982                diff = cmp_psn(psn, wqe->psn);
 983                if (diff < 0)
 984                        break;
 985                qp->s_cur = n;
 986                /*
 987                 * If we are starting the request from the beginning,
 988                 * let the normal send code handle initialization.
 989                 */
 990                if (diff == 0) {
 991                        qp->s_state = OP(SEND_LAST);
 992                        goto done;
 993                }
 994                opcode = wqe->wr.opcode;
 995        }
 996
 997        /*
 998         * Set the state to restart in the middle of a request.
 999         * Don't change the s_sge, s_cur_sge, or s_cur_size.
1000         * See hfi1_make_rc_req().
1001         */
1002        switch (opcode) {
1003        case IB_WR_SEND:
1004        case IB_WR_SEND_WITH_IMM:
1005                qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
1006                break;
1007
1008        case IB_WR_RDMA_WRITE:
1009        case IB_WR_RDMA_WRITE_WITH_IMM:
1010                qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
1011                break;
1012
1013        case IB_WR_RDMA_READ:
1014                qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
1015                break;
1016
1017        default:
1018                /*
 1019                 * This case shouldn't happen since it's only
1020                 * one PSN per req.
1021                 */
1022                qp->s_state = OP(SEND_LAST);
1023        }
1024done:
1025        qp->s_psn = psn;
1026        /*
1027         * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
1028         * asynchronously before the send engine can get scheduled.
1029         * Doing it in hfi1_make_rc_req() is too late.
1030         */
1031        if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
1032            (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
1033                qp->s_flags |= RVT_S_WAIT_PSN;
1034        qp->s_flags &= ~RVT_S_AHG_VALID;
1035}
1036
1037/*
1038 * Back up requester to resend the last un-ACKed request.
1039 * The QP r_lock and s_lock should be held and interrupts disabled.
1040 */
1041static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
1042{
1043        struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1044        struct hfi1_ibport *ibp;
1045
1046        lockdep_assert_held(&qp->r_lock);
1047        lockdep_assert_held(&qp->s_lock);
1048        if (qp->s_retry == 0) {
1049                if (qp->s_mig_state == IB_MIG_ARMED) {
1050                        hfi1_migrate_qp(qp);
1051                        qp->s_retry = qp->s_retry_cnt;
1052                } else if (qp->s_last == qp->s_acked) {
1053                        hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
1054                        rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1055                        return;
1056                } else { /* need to handle delayed completion */
1057                        return;
1058                }
1059        } else {
1060                qp->s_retry--;
1061        }
1062
1063        ibp = to_iport(qp->ibqp.device, qp->port_num);
1064        if (wqe->wr.opcode == IB_WR_RDMA_READ)
1065                ibp->rvp.n_rc_resends++;
1066        else
1067                ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1068
1069        qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
1070                         RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
1071                         RVT_S_WAIT_ACK);
1072        if (wait)
1073                qp->s_flags |= RVT_S_SEND_ONE;
1074        reset_psn(qp, psn);
1075}
1076
1077/*
1078 * This is called from s_timer for missing responses.
1079 */
1080void hfi1_rc_timeout(unsigned long arg)
1081{
1082        struct rvt_qp *qp = (struct rvt_qp *)arg;
1083        struct hfi1_ibport *ibp;
1084        unsigned long flags;
1085
1086        spin_lock_irqsave(&qp->r_lock, flags);
1087        spin_lock(&qp->s_lock);
1088        if (qp->s_flags & RVT_S_TIMER) {
1089                ibp = to_iport(qp->ibqp.device, qp->port_num);
1090                ibp->rvp.n_rc_timeouts++;
1091                qp->s_flags &= ~RVT_S_TIMER;
1092                del_timer(&qp->s_timer);
1093                trace_hfi1_timeout(qp, qp->s_last_psn + 1);
1094                restart_rc(qp, qp->s_last_psn + 1, 1);
1095                hfi1_schedule_send(qp);
1096        }
1097        spin_unlock(&qp->s_lock);
1098        spin_unlock_irqrestore(&qp->r_lock, flags);
1099}
1100
1101/*
1102 * This is called from s_timer for RNR timeouts.
1103 */
1104void hfi1_rc_rnr_retry(unsigned long arg)
1105{
1106        struct rvt_qp *qp = (struct rvt_qp *)arg;
1107        unsigned long flags;
1108
1109        spin_lock_irqsave(&qp->s_lock, flags);
1110        hfi1_stop_rnr_timer(qp);
1111        hfi1_schedule_send(qp);
1112        spin_unlock_irqrestore(&qp->s_lock, flags);
1113}
1114
1115/*
1116 * Set qp->s_sending_psn to the next PSN after the given one.
1117 * This would be psn+1 except when RDMA reads are present.
1118 */
1119static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
1120{
1121        struct rvt_swqe *wqe;
1122        u32 n = qp->s_last;
1123
1124        lockdep_assert_held(&qp->s_lock);
1125        /* Find the work request corresponding to the given PSN. */
1126        for (;;) {
1127                wqe = rvt_get_swqe_ptr(qp, n);
1128                if (cmp_psn(psn, wqe->lpsn) <= 0) {
1129                        if (wqe->wr.opcode == IB_WR_RDMA_READ)
1130                                qp->s_sending_psn = wqe->lpsn + 1;
1131                        else
1132                                qp->s_sending_psn = psn + 1;
1133                        break;
1134                }
1135                if (++n == qp->s_size)
1136                        n = 0;
1137                if (n == qp->s_tail)
1138                        break;
1139        }
1140}
1141
1142/*
1143 * This should be called with the QP s_lock held and interrupts disabled.
1144 */
1145void hfi1_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
1146{
1147        struct ib_other_headers *ohdr;
1148        struct rvt_swqe *wqe;
1149        struct ib_wc wc;
1150        unsigned i;
1151        u32 opcode;
1152        u32 psn;
1153
1154        lockdep_assert_held(&qp->s_lock);
1155        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
1156                return;
1157
1158        /* Find out where the BTH is */
1159        if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
1160                ohdr = &hdr->u.oth;
1161        else
1162                ohdr = &hdr->u.l.oth;
1163
1164        opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
1165        if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
1166            opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
1167                WARN_ON(!qp->s_rdma_ack_cnt);
1168                qp->s_rdma_ack_cnt--;
1169                return;
1170        }
1171
1172        psn = be32_to_cpu(ohdr->bth[2]);
1173        reset_sending_psn(qp, psn);
1174
1175        /*
1176         * Start timer after a packet requesting an ACK has been sent and
1177         * there are still requests that haven't been acked.
1178         */
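        /*
         * psn here is the raw bth[2] word, so the ACK-request bit
         * (IB_BTH_REQ_ACK, the top bit) is still present and can be
         * tested directly.
         */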
1179        if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
1180            !(qp->s_flags &
1181                (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
1182                (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
1183                hfi1_add_retry_timer(qp);
1184
1185        while (qp->s_last != qp->s_acked) {
1186                u32 s_last;
1187
1188                wqe = rvt_get_swqe_ptr(qp, qp->s_last);
1189                if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
1190                    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
1191                        break;
1192                s_last = qp->s_last;
1193                if (++s_last >= qp->s_size)
1194                        s_last = 0;
1195                qp->s_last = s_last;
1196                /* see post_send() */
1197                barrier();
1198                for (i = 0; i < wqe->wr.num_sge; i++) {
1199                        struct rvt_sge *sge = &wqe->sg_list[i];
1200
1201                        rvt_put_mr(sge->mr);
1202                }
1203                /* Post a send completion queue entry if requested. */
1204                if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
1205                    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1206                        memset(&wc, 0, sizeof(wc));
1207                        wc.wr_id = wqe->wr.wr_id;
1208                        wc.status = IB_WC_SUCCESS;
1209                        wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
1210                        wc.byte_len = wqe->length;
1211                        wc.qp = &qp->ibqp;
1212                        rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
1213                }
1214        }
1215        /*
1216         * If we were waiting for sends to complete before re-sending,
1217         * and they are now complete, restart sending.
1218         */
1219        trace_hfi1_sendcomplete(qp, psn);
1220        if (qp->s_flags & RVT_S_WAIT_PSN &&
1221            cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1222                qp->s_flags &= ~RVT_S_WAIT_PSN;
1223                qp->s_sending_psn = qp->s_psn;
1224                qp->s_sending_hpsn = qp->s_psn - 1;
1225                hfi1_schedule_send(qp);
1226        }
1227}
1228
1229static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
1230{
1231        qp->s_last_psn = psn;
1232}
1233
1234/*
1235 * Generate a SWQE completion.
1236 * This is similar to hfi1_send_complete but has to check to be sure
1237 * that the SGEs are not being referenced if the SWQE is being resent.
1238 */
1239static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
1240                                         struct rvt_swqe *wqe,
1241                                         struct hfi1_ibport *ibp)
1242{
1243        struct ib_wc wc;
1244        unsigned i;
1245
1246        lockdep_assert_held(&qp->s_lock);
1247        /*
1248         * Don't decrement refcount and don't generate a
1249         * completion if the SWQE is being resent until the send
1250         * is finished.
1251         */
1252        if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
1253            cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
1254                u32 s_last;
1255
1256                for (i = 0; i < wqe->wr.num_sge; i++) {
1257                        struct rvt_sge *sge = &wqe->sg_list[i];
1258
1259                        rvt_put_mr(sge->mr);
1260                }
1261                s_last = qp->s_last;
1262                if (++s_last >= qp->s_size)
1263                        s_last = 0;
1264                qp->s_last = s_last;
1265                /* see post_send() */
1266                barrier();
1267                /* Post a send completion queue entry if requested. */
1268                if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
1269                    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
1270                        memset(&wc, 0, sizeof(wc));
1271                        wc.wr_id = wqe->wr.wr_id;
1272                        wc.status = IB_WC_SUCCESS;
1273                        wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
1274                        wc.byte_len = wqe->length;
1275                        wc.qp = &qp->ibqp;
1276                        rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
1277                }
1278        } else {
1279                struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
1280
1281                this_cpu_inc(*ibp->rvp.rc_delayed_comp);
1282                /*
 1283                 * If send progress is not running, attempt to progress
 1284                 * the SDMA queue.
1285                 */
1286                if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
1287                        struct sdma_engine *engine;
1288                        u8 sc5;
1289
1290                        /* For now use sc to find engine */
1291                        sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
1292                        engine = qp_to_sdma_engine(qp, sc5);
1293                        sdma_engine_progress_schedule(engine);
1294                }
1295        }
1296
1297        qp->s_retry = qp->s_retry_cnt;
1298        update_last_psn(qp, wqe->lpsn);
1299
1300        /*
1301         * If we are completing a request which is in the process of
1302         * being resent, we can stop re-sending it since we know the
1303         * responder has already seen it.
1304         */
1305        if (qp->s_acked == qp->s_cur) {
1306                if (++qp->s_cur >= qp->s_size)
1307                        qp->s_cur = 0;
1308                qp->s_acked = qp->s_cur;
1309                wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
1310                if (qp->s_acked != qp->s_tail) {
1311                        qp->s_state = OP(SEND_LAST);
1312                        qp->s_psn = wqe->psn;
1313                }
1314        } else {
1315                if (++qp->s_acked >= qp->s_size)
1316                        qp->s_acked = 0;
1317                if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
1318                        qp->s_draining = 0;
1319                wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1320        }
1321        return wqe;
1322}
1323
1324/**
1325 * do_rc_ack - process an incoming RC ACK
1326 * @qp: the QP the ACK came in on
1327 * @psn: the packet sequence number of the ACK
1328 * @opcode: the opcode of the request that resulted in the ACK
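 * @aeth: the AETH from the incoming ACK packet
 * @val: the 64-bit data returned by an atomic ACK, if any
 * @rcd: the receive context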
1329 *
1330 * This is called from rc_rcv_resp() to process an incoming RC ACK
1331 * for the given QP.
1332 * May be called at interrupt level, with the QP s_lock held.
1333 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
1334 */
1335static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
1336                     u64 val, struct hfi1_ctxtdata *rcd)
1337{
1338        struct hfi1_ibport *ibp;
1339        enum ib_wc_status status;
1340        struct rvt_swqe *wqe;
1341        int ret = 0;
1342        u32 ack_psn;
1343        int diff;
1344        unsigned long to;
1345
1346        lockdep_assert_held(&qp->s_lock);
1347        /*
1348         * Note that NAKs implicitly ACK outstanding SEND and RDMA write
1349         * requests and implicitly NAK RDMA read and atomic requests issued
1350         * before the NAK'ed request.  The MSN won't include the NAK'ed
1351         * request but will include an ACK'ed request(s).
1352         * request but will include any ACK'ed requests.
1353        ack_psn = psn;
1354        if (aeth >> 29)
1355                ack_psn--;
1356        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1357        ibp = to_iport(qp->ibqp.device, qp->port_num);
1358
1359        /*
1360         * The MSN might be for a later WQE than the PSN indicates so
1361         * only complete WQEs that the PSN finishes.
1362         */
1363        while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
1364                /*
1365                 * RDMA_READ_RESPONSE_ONLY is a special case since
1366                 * we want to generate completion events for everything
1367                 * before the RDMA read, copy the data, then generate
1368                 * the completion for the read.
1369                 */
1370                if (wqe->wr.opcode == IB_WR_RDMA_READ &&
1371                    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
1372                    diff == 0) {
1373                        ret = 1;
1374                        goto bail_stop;
1375                }
1376                /*
1377                 * If this request is a RDMA read or atomic, and the ACK is
1378                 * for a later operation, this ACK NAKs the RDMA read or
1379                 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
1380                 * can ACK a RDMA read and likewise for atomic ops.  Note
1381                 * that the NAK case can only happen if relaxed ordering is
1382                 * used and requests are sent after an RDMA read or atomic
1383                 * is sent but before the response is received.
1384                 */
1385                if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
1386                     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
1387                    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1388                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
1389                     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
1390                        /* Retry this request. */
1391                        if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
1392                                qp->r_flags |= RVT_R_RDMAR_SEQ;
1393                                restart_rc(qp, qp->s_last_psn + 1, 0);
1394                                if (list_empty(&qp->rspwait)) {
1395                                        qp->r_flags |= RVT_R_RSP_SEND;
1396                                        rvt_get_qp(qp);
1397                                        list_add_tail(&qp->rspwait,
1398                                                      &rcd->qp_wait_list);
1399                                }
1400                        }
1401                        /*
1402                         * No need to process the ACK/NAK since we are
1403                         * restarting an earlier request.
1404                         */
1405                        goto bail_stop;
1406                }
1407                if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1408                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
1409                        u64 *vaddr = wqe->sg_list[0].vaddr;
1410                        *vaddr = val;
1411                }
1412                if (qp->s_num_rd_atomic &&
1413                    (wqe->wr.opcode == IB_WR_RDMA_READ ||
1414                     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1415                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
1416                        qp->s_num_rd_atomic--;
1417                        /* Restart sending task if fence is complete */
1418                        if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
1419                            !qp->s_num_rd_atomic) {
1420                                qp->s_flags &= ~(RVT_S_WAIT_FENCE |
1421                                                 RVT_S_WAIT_ACK);
1422                                hfi1_schedule_send(qp);
1423                        } else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
1424                                qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
1425                                                 RVT_S_WAIT_ACK);
1426                                hfi1_schedule_send(qp);
1427                        }
1428                }
1429                wqe = do_rc_completion(qp, wqe, ibp);
1430                if (qp->s_acked == qp->s_tail)
1431                        break;
1432        }
1433
1434        switch (aeth >> 29) {
1435        case 0:         /* ACK */
1436                this_cpu_inc(*ibp->rvp.rc_acks);
1437                if (qp->s_acked != qp->s_tail) {
1438                        /*
1439                         * We are expecting more ACKs so
1440                         * mod the retry timer.
1441                         */
1442                        hfi1_mod_retry_timer(qp);
1443                        /*
1444                         * We can stop re-sending the earlier packets and
1445                         * continue with the next packet the receiver wants.
1446                         */
1447                        if (cmp_psn(qp->s_psn, psn) <= 0)
1448                                reset_psn(qp, psn + 1);
1449                } else {
1450                        /* No more acks - kill all timers */
1451                        hfi1_stop_rc_timers(qp);
1452                        if (cmp_psn(qp->s_psn, psn) <= 0) {
1453                                qp->s_state = OP(SEND_LAST);
1454                                qp->s_psn = psn + 1;
1455                        }
1456                }
1457                if (qp->s_flags & RVT_S_WAIT_ACK) {
1458                        qp->s_flags &= ~RVT_S_WAIT_ACK;
1459                        hfi1_schedule_send(qp);
1460                }
1461                hfi1_get_credit(qp, aeth);
1462                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1463                qp->s_retry = qp->s_retry_cnt;
1464                update_last_psn(qp, psn);
1465                return 1;
1466
1467        case 1:         /* RNR NAK */
1468                ibp->rvp.n_rnr_naks++;
1469                if (qp->s_acked == qp->s_tail)
1470                        goto bail_stop;
1471                if (qp->s_flags & RVT_S_WAIT_RNR)
1472                        goto bail_stop;
1473                if (qp->s_rnr_retry == 0) {
1474                        status = IB_WC_RNR_RETRY_EXC_ERR;
1475                        goto class_b;
1476                }
1477                if (qp->s_rnr_retry_cnt < 7)
1478                        qp->s_rnr_retry--;
1479
1480                /* The last valid PSN is the previous PSN. */
1481                update_last_psn(qp, psn - 1);
1482
1483                ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
1484
1485                reset_psn(qp, psn);
1486
1487                qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
1488                hfi1_stop_rc_timers(qp);
1489                to =
1490                        ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
1491                                           HFI1_AETH_CREDIT_MASK];
1492                hfi1_add_rnr_timer(qp, to);
1493                return 0;
1494
1495        case 3:         /* NAK */
1496                if (qp->s_acked == qp->s_tail)
1497                        goto bail_stop;
1498                /* The last valid PSN is the previous PSN. */
1499                update_last_psn(qp, psn - 1);
1500                switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
1501                        HFI1_AETH_CREDIT_MASK) {
1502                case 0: /* PSN sequence error */
1503                        ibp->rvp.n_seq_naks++;
1504                        /*
1505                         * Back up to the responder's expected PSN.
1506                         * Note that we might get a NAK in the middle of an
1507                         * RDMA READ response which terminates the RDMA
1508                         * READ.
1509                         */
1510                        restart_rc(qp, psn, 0);
1511                        hfi1_schedule_send(qp);
1512                        break;
1513
1514                case 1: /* Invalid Request */
1515                        status = IB_WC_REM_INV_REQ_ERR;
1516                        ibp->rvp.n_other_naks++;
1517                        goto class_b;
1518
1519                case 2: /* Remote Access Error */
1520                        status = IB_WC_REM_ACCESS_ERR;
1521                        ibp->rvp.n_other_naks++;
1522                        goto class_b;
1523
1524                case 3: /* Remote Operation Error */
1525                        status = IB_WC_REM_OP_ERR;
1526                        ibp->rvp.n_other_naks++;
1527class_b:
1528                        if (qp->s_last == qp->s_acked) {
1529                                hfi1_send_complete(qp, wqe, status);
1530                                rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1531                        }
1532                        break;
1533
1534                default:
1535                        /* Ignore other reserved NAK error codes */
1536                        goto reserved;
1537                }
1538                qp->s_retry = qp->s_retry_cnt;
1539                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
1540                goto bail_stop;
1541
1542        default:                /* 2: reserved */
1543reserved:
1544                /* Ignore reserved NAK codes. */
1545                goto bail_stop;
1546        }
1547        /* cannot be reached  */
1548bail_stop:
1549        hfi1_stop_rc_timers(qp);
1550        return ret;
1551}
1552
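/*
 * Illustrative sketch only, not part of the driver build.  do_rc_ack()
 * above switches on the top three bits of the AETH (aeth >> 29):
 * 0 is an ACK, 1 an RNR NAK, 3 a NAK, and 2 is reserved.  The 5-bit
 * field pulled out with HFI1_AETH_CREDIT_SHIFT/MASK is then reused as
 * a credit count, an RNR timer index, or a NAK code, depending on
 * that type.  The helper name below is invented; the two macros are
 * assumed to behave exactly as they are used above.
 */
static inline void aeth_decode_sketch(u32 aeth, u32 *type, u32 *code)
{
        *type = aeth >> 29;                     /* ACK / RNR NAK / NAK */
        *code = (aeth >> HFI1_AETH_CREDIT_SHIFT) &
                HFI1_AETH_CREDIT_MASK;          /* credit, RNR timer or NAK code */
}
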
1553/*
1554 * We have seen an out of sequence RDMA read middle or last packet.
1555 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
1556 */
1557static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
1558                         struct hfi1_ctxtdata *rcd)
1559{
1560        struct rvt_swqe *wqe;
1561
1562        lockdep_assert_held(&qp->s_lock);
1563        /* Remove QP from retry timer */
1564        hfi1_stop_rc_timers(qp);
1565
1566        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1567
1568        while (cmp_psn(psn, wqe->lpsn) > 0) {
1569                if (wqe->wr.opcode == IB_WR_RDMA_READ ||
1570                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
1571                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
1572                        break;
1573                wqe = do_rc_completion(qp, wqe, ibp);
1574        }
1575
1576        ibp->rvp.n_rdma_seq++;
1577        qp->r_flags |= RVT_R_RDMAR_SEQ;
1578        restart_rc(qp, qp->s_last_psn + 1, 0);
1579        if (list_empty(&qp->rspwait)) {
1580                qp->r_flags |= RVT_R_RSP_SEND;
1581                rvt_get_qp(qp);
1582                list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1583        }
1584}
1585
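/*
 * Illustrative sketch only, not part of the driver build.
 * rdma_seq_err() and the rest of this file compare PSNs with
 * cmp_psn()/delta_psn(), which treat them as 24-bit sequence numbers
 * that wrap.  The real helpers live in the verbs/rdmavt headers; this
 * is only the usual sign-extension trick they are built around, under
 * an invented name, and it relies on arithmetic right shifts the same
 * way the kernel does.
 */
static inline int psn_delta_sketch(u32 a, u32 b)
{
        /* subtract, keep the low 24 bits, then sign-extend bit 23 */
        return ((int)((a - b) << 8)) >> 8;
}
/* psn_delta_sketch(0, 0xffffff) == 1: PSN 0 is one ahead of 0xffffff */
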
1586/**
1587 * rc_rcv_resp - process an incoming RC response packet
1588 * @ibp: the port this packet came in on
1589 * @ohdr: the other headers for this packet
1590 * @data: the packet data
1591 * @tlen: the packet length
1592 * @qp: the QP for this packet
1593 * @opcode: the opcode for this packet
1594 * @psn: the packet sequence number for this packet
1595 * @hdrsize: the header length
1596 * @pmtu: the path MTU
1597 * @rcd: the receive context
1598 *
1599 * This is called from hfi1_rc_rcv() to process an incoming RC response
1600 * packet for the given QP.  Called at interrupt level.
1601 */
1602static void rc_rcv_resp(struct hfi1_ibport *ibp,
1603                        struct ib_other_headers *ohdr,
1604                        void *data, u32 tlen, struct rvt_qp *qp,
1605                        u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
1606                        struct hfi1_ctxtdata *rcd)
1607{
1608        struct rvt_swqe *wqe;
1609        enum ib_wc_status status;
1610        unsigned long flags;
1611        int diff;
1612        u32 pad;
1613        u32 aeth;
1614        u64 val;
1615
1616        spin_lock_irqsave(&qp->s_lock, flags);
1617
1618        trace_hfi1_ack(qp, psn);
1619
1620        /* Ignore invalid responses. */
1621        smp_read_barrier_depends(); /* see post_one_send */
1622        if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
1623                goto ack_done;
1624
1625        /* Ignore duplicate responses. */
1626        diff = cmp_psn(psn, qp->s_last_psn);
1627        if (unlikely(diff <= 0)) {
1628                /* Update credits for "ghost" ACKs */
1629                if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
1630                        aeth = be32_to_cpu(ohdr->u.aeth);
1631                        if ((aeth >> 29) == 0)
1632                                hfi1_get_credit(qp, aeth);
1633                }
1634                goto ack_done;
1635        }
1636
1637        /*
1638         * Skip everything other than the PSN we expect, if we are waiting
1639         * for a reply to a restarted RDMA read or atomic op.
1640         */
1641        if (qp->r_flags & RVT_R_RDMAR_SEQ) {
1642                if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
1643                        goto ack_done;
1644                qp->r_flags &= ~RVT_R_RDMAR_SEQ;
1645        }
1646
1647        if (unlikely(qp->s_acked == qp->s_tail))
1648                goto ack_done;
1649        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1650        status = IB_WC_SUCCESS;
1651
1652        switch (opcode) {
1653        case OP(ACKNOWLEDGE):
1654        case OP(ATOMIC_ACKNOWLEDGE):
1655        case OP(RDMA_READ_RESPONSE_FIRST):
1656                aeth = be32_to_cpu(ohdr->u.aeth);
1657                if (opcode == OP(ATOMIC_ACKNOWLEDGE))
1658                        val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
1659                else
1660                        val = 0;
1661                if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
1662                    opcode != OP(RDMA_READ_RESPONSE_FIRST))
1663                        goto ack_done;
1664                wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1665                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1666                        goto ack_op_err;
1667                /*
1668                 * If this is a response to a resent RDMA read, we
1669                 * have to be careful to copy the data to the right
1670                 * location.
1671                 */
1672                qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1673                                                  wqe, psn, pmtu);
1674                goto read_middle;
1675
1676        case OP(RDMA_READ_RESPONSE_MIDDLE):
1677                /* no AETH, no ACK */
1678                if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1679                        goto ack_seq_err;
1680                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1681                        goto ack_op_err;
1682read_middle:
1683                if (unlikely(tlen != (hdrsize + pmtu + 4)))
1684                        goto ack_len_err;
1685                if (unlikely(pmtu >= qp->s_rdma_read_len))
1686                        goto ack_len_err;
1687
1688                /*
1689                 * We got a response so update the timeout.
1690                 * 4.096 usec. * (1 << qp->timeout)
1691                 */
1692                qp->s_flags |= RVT_S_TIMER;
1693                mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
1694                if (qp->s_flags & RVT_S_WAIT_ACK) {
1695                        qp->s_flags &= ~RVT_S_WAIT_ACK;
1696                        hfi1_schedule_send(qp);
1697                }
1698
1699                if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
1700                        qp->s_retry = qp->s_retry_cnt;
1701
1702                /*
1703                 * Update the RDMA receive state but do the copy w/o
1704                 * holding the locks and blocking interrupts.
1705                 */
1706                qp->s_rdma_read_len -= pmtu;
1707                update_last_psn(qp, psn);
1708                spin_unlock_irqrestore(&qp->s_lock, flags);
1709                hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
1710                goto bail;
1711
1712        case OP(RDMA_READ_RESPONSE_ONLY):
1713                aeth = be32_to_cpu(ohdr->u.aeth);
1714                if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
1715                        goto ack_done;
1716                /* Get the number of bytes the message was padded by. */
1717                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1718                /*
1719                 * Check that the data size is >= 0 && <= pmtu.
1720                 * Remember to account for ICRC (4).
1721                 */
1722                if (unlikely(tlen < (hdrsize + pad + 4)))
1723                        goto ack_len_err;
1724                /*
1725                 * If this is a response to a resent RDMA read, we
1726                 * have to be careful to copy the data to the right
1727                 * location.
1728                 */
1729                wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
1730                qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
1731                                                  wqe, psn, pmtu);
1732                goto read_last;
1733
1734        case OP(RDMA_READ_RESPONSE_LAST):
1735                /* ACKs READ req. */
1736                if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
1737                        goto ack_seq_err;
1738                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
1739                        goto ack_op_err;
1740                /* Get the number of bytes the message was padded by. */
1741                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
1742                /*
1743                 * Check that the data size is >= 1 && <= pmtu.
1744                 * Remember to account for ICRC (4).
1745                 */
1746                if (unlikely(tlen <= (hdrsize + pad + 4)))
1747                        goto ack_len_err;
1748read_last:
1749                tlen -= hdrsize + pad + 4;
1750                if (unlikely(tlen != qp->s_rdma_read_len))
1751                        goto ack_len_err;
1752                aeth = be32_to_cpu(ohdr->u.aeth);
1753                hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
1754                WARN_ON(qp->s_rdma_read_sge.num_sge);
1755                (void)do_rc_ack(qp, aeth, psn,
1756                                 OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
1757                goto ack_done;
1758        }
1759
1760ack_op_err:
1761        status = IB_WC_LOC_QP_OP_ERR;
1762        goto ack_err;
1763
1764ack_seq_err:
1765        rdma_seq_err(qp, ibp, psn, rcd);
1766        goto ack_done;
1767
1768ack_len_err:
1769        status = IB_WC_LOC_LEN_ERR;
1770ack_err:
1771        if (qp->s_last == qp->s_acked) {
1772                hfi1_send_complete(qp, wqe, status);
1773                rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1774        }
1775ack_done:
1776        spin_unlock_irqrestore(&qp->s_lock, flags);
1777bail:
1778        return;
1779}
1780
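/*
 * Illustrative sketch only, not part of the driver build.
 * rc_rcv_resp() above validates read-response lengths: a MIDDLE packet
 * must carry exactly one PMTU of payload plus the 4-byte ICRC, while
 * LAST needs at least one payload byte and ONLY may carry none, once
 * the BTH pad bytes are accounted for.  A sketch of the LAST/ONLY
 * check; the helper name and the min_payload parameter are invented.
 */
static inline int read_resp_len_ok_sketch(u32 tlen, u32 hdrsize, u32 pad,
                                          u32 min_payload)
{
        /* header + payload + pad + 4-byte ICRC must all fit in tlen */
        return tlen >= hdrsize + pad + 4 + min_payload;
}
/* min_payload is 1 for RDMA_READ_RESPONSE_LAST and 0 for _ONLY */
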
1781static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
1782                                  struct rvt_qp *qp)
1783{
1784        if (list_empty(&qp->rspwait)) {
1785                qp->r_flags |= RVT_R_RSP_NAK;
1786                rvt_get_qp(qp);
1787                list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
1788        }
1789}
1790
1791static inline void rc_cancel_ack(struct rvt_qp *qp)
1792{
1793        struct hfi1_qp_priv *priv = qp->priv;
1794
1795        priv->r_adefered = 0;
1796        if (list_empty(&qp->rspwait))
1797                return;
1798        list_del_init(&qp->rspwait);
1799        qp->r_flags &= ~RVT_R_RSP_NAK;
1800        rvt_put_qp(qp);
1801}
1802
1803/**
1804 * rc_rcv_error - process an incoming duplicate or error RC packet
1805 * @ohdr: the other headers for this packet
1806 * @data: the packet data
1807 * @qp: the QP for this packet
1808 * @opcode: the opcode for this packet
1809 * @psn: the packet sequence number for this packet
1810 * @diff: the difference between the PSN and the expected PSN
1811 * @rcd: the receive context
1812 *
1813 * This is called from hfi1_rc_rcv() to process an unexpected incoming
1814 * RC packet for the given QP.  Called at interrupt level.
1815 * Return 1 if no more processing is needed; otherwise return 0 to
1816 * schedule a response to be sent.
1817 */
1818static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
1819                                 struct rvt_qp *qp, u32 opcode, u32 psn,
1820                                 int diff, struct hfi1_ctxtdata *rcd)
1821{
1822        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1823        struct rvt_ack_entry *e;
1824        unsigned long flags;
1825        u8 i, prev;
1826        int old_req;
1827
1828        trace_hfi1_rcv_error(qp, psn);
1829        if (diff > 0) {
1830                /*
1831                 * Packet sequence error.
1832                 * A NAK will ACK earlier sends and RDMA writes.
1833                 * Don't queue the NAK if we already sent one.
1834                 */
1835                if (!qp->r_nak_state) {
1836                        ibp->rvp.n_rc_seqnak++;
1837                        qp->r_nak_state = IB_NAK_PSN_ERROR;
1838                        /* Use the expected PSN. */
1839                        qp->r_ack_psn = qp->r_psn;
1840                        /*
1841                         * Wait to send the sequence NAK until all packets
1842                         * in the receive queue have been processed.
1843                         * Otherwise, we end up propagating congestion.
1844                         */
1845                        rc_defered_ack(rcd, qp);
1846                }
1847                goto done;
1848        }
1849
1850        /*
1851         * Handle a duplicate request.  Don't re-execute SEND, RDMA
1852         * write or atomic op.  Don't NAK errors, just silently drop
1853         * the duplicate request.  Note that r_sge, r_len, and
1854         * r_rcv_len may be in use so don't modify them.
1855         *
1856         * We are supposed to ACK the earliest duplicate PSN but we
1857         * can coalesce an outstanding duplicate ACK.  We have to
1858         * send the earliest so that RDMA reads can be restarted at
1859         * the requester's expected PSN.
1860         *
1861         * First, find where this duplicate PSN falls within the
1862         * ACKs previously sent.
1863         * old_req is true if there is an older response that is scheduled
1864         * to be sent before sending this one.
1865         */
1866        e = NULL;
1867        old_req = 1;
1868        ibp->rvp.n_rc_dupreq++;
1869
1870        spin_lock_irqsave(&qp->s_lock, flags);
1871
1872        for (i = qp->r_head_ack_queue; ; i = prev) {
1873                if (i == qp->s_tail_ack_queue)
1874                        old_req = 0;
1875                if (i)
1876                        prev = i - 1;
1877                else
1878                        prev = HFI1_MAX_RDMA_ATOMIC;
1879                if (prev == qp->r_head_ack_queue) {
1880                        e = NULL;
1881                        break;
1882                }
1883                e = &qp->s_ack_queue[prev];
1884                if (!e->opcode) {
1885                        e = NULL;
1886                        break;
1887                }
1888                if (cmp_psn(psn, e->psn) >= 0) {
1889                        if (prev == qp->s_tail_ack_queue &&
1890                            cmp_psn(psn, e->lpsn) <= 0)
1891                                old_req = 0;
1892                        break;
1893                }
1894        }
1895        switch (opcode) {
1896        case OP(RDMA_READ_REQUEST): {
1897                struct ib_reth *reth;
1898                u32 offset;
1899                u32 len;
1900
1901                /*
1902                 * If we didn't find the RDMA read request in the ack queue,
1903                 * we can ignore this request.
1904                 */
1905                if (!e || e->opcode != OP(RDMA_READ_REQUEST))
1906                        goto unlock_done;
1907                /* RETH comes after BTH */
1908                reth = &ohdr->u.rc.reth;
1909                /*
1910                 * Address range must be a subset of the original
1911                 * request and start on pmtu boundaries.
1912                 * We reuse the old ack_queue slot since the requester
1913                 * should not back up and request an earlier PSN for the
1914                 * same request.
1915                 */
1916                offset = delta_psn(psn, e->psn) * qp->pmtu;
1917                len = be32_to_cpu(reth->length);
1918                if (unlikely(offset + len != e->rdma_sge.sge_length))
1919                        goto unlock_done;
1920                if (e->rdma_sge.mr) {
1921                        rvt_put_mr(e->rdma_sge.mr);
1922                        e->rdma_sge.mr = NULL;
1923                }
1924                if (len != 0) {
1925                        u32 rkey = be32_to_cpu(reth->rkey);
1926                        u64 vaddr = get_ib_reth_vaddr(reth);
1927                        int ok;
1928
1929                        ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
1930                                         IB_ACCESS_REMOTE_READ);
1931                        if (unlikely(!ok))
1932                                goto unlock_done;
1933                } else {
1934                        e->rdma_sge.vaddr = NULL;
1935                        e->rdma_sge.length = 0;
1936                        e->rdma_sge.sge_length = 0;
1937                }
1938                e->psn = psn;
1939                if (old_req)
1940                        goto unlock_done;
1941                qp->s_tail_ack_queue = prev;
1942                break;
1943        }
1944
1945        case OP(COMPARE_SWAP):
1946        case OP(FETCH_ADD): {
1947                /*
1948                 * If we didn't find the atomic request in the ack queue
1949                 * or the send engine is already backed up to send an
1950                 * earlier entry, we can ignore this request.
1951                 */
1952                if (!e || e->opcode != (u8)opcode || old_req)
1953                        goto unlock_done;
1954                qp->s_tail_ack_queue = prev;
1955                break;
1956        }
1957
1958        default:
1959                /*
1960                 * Ignore this operation if it doesn't request an ACK
1961                 * or an earlier RDMA read or atomic is going to be resent.
1962                 */
1963                if (!(psn & IB_BTH_REQ_ACK) || old_req)
1964                        goto unlock_done;
1965                /*
1966                 * Resend the most recent ACK if this request is
1967                 * after all the previous RDMA reads and atomics.
1968                 */
1969                if (i == qp->r_head_ack_queue) {
1970                        spin_unlock_irqrestore(&qp->s_lock, flags);
1971                        qp->r_nak_state = 0;
1972                        qp->r_ack_psn = qp->r_psn - 1;
1973                        goto send_ack;
1974                }
1975
1976                /*
1977                 * Resend the RDMA read or atomic op which
1978                 * ACKs this duplicate request.
1979                 */
1980                qp->s_tail_ack_queue = i;
1981                break;
1982        }
1983        qp->s_ack_state = OP(ACKNOWLEDGE);
1984        qp->s_flags |= RVT_S_RESP_PENDING;
1985        qp->r_nak_state = 0;
1986        hfi1_schedule_send(qp);
1987
1988unlock_done:
1989        spin_unlock_irqrestore(&qp->s_lock, flags);
1990done:
1991        return 1;
1992
1993send_ack:
1994        return 0;
1995}
1996
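/*
 * Illustrative sketch only, not part of the driver build.  The
 * duplicate-request path in rc_rcv_error() above walks the circular
 * s_ack_queue backwards from r_head_ack_queue, stopping at the first
 * in-use entry whose starting PSN is not later than the duplicate PSN,
 * or giving up once it wraps around or hits an empty slot.  This is a
 * simplified version of that walk, leaving out the old_req
 * bookkeeping; the helper name is invented and nslots corresponds to
 * HFI1_MAX_RDMA_ATOMIC + 1.
 */
static inline int dup_ack_scan_sketch(struct rvt_ack_entry *q, u32 nslots,
                                      u32 head, u32 psn)
{
        u32 i = head, prev;

        for (;;) {
                prev = i ? i - 1 : nslots - 1;  /* step back, with wrap */
                if (prev == head || !q[prev].opcode)
                        return -1;              /* wrapped around or empty slot */
                if (cmp_psn(psn, q[prev].psn) >= 0)
                        return prev;            /* candidate entry found */
                i = prev;
        }
}
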
1997void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
1998{
1999        unsigned long flags;
2000        int lastwqe;
2001
2002        spin_lock_irqsave(&qp->s_lock, flags);
2003        lastwqe = rvt_error_qp(qp, err);
2004        spin_unlock_irqrestore(&qp->s_lock, flags);
2005
2006        if (lastwqe) {
2007                struct ib_event ev;
2008
2009                ev.device = qp->ibqp.device;
2010                ev.element.qp = &qp->ibqp;
2011                ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
2012                qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
2013        }
2014}
2015
2016static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
2017{
2018        unsigned next;
2019
2020        next = n + 1;
2021        if (next > HFI1_MAX_RDMA_ATOMIC)
2022                next = 0;
2023        qp->s_tail_ack_queue = next;
2024        qp->s_ack_state = OP(ACKNOWLEDGE);
2025}
2026
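/*
 * Illustrative sketch only, not part of the driver build.  The
 * responder's s_ack_queue holds HFI1_MAX_RDMA_ATOMIC + 1 entries, so
 * valid indices run 0..HFI1_MAX_RDMA_ATOMIC inclusive and the wrap
 * test in update_ack_queue() above is "> MAX" rather than ">= MAX".
 * The helper name below is invented.
 */
static inline unsigned int ack_queue_next_sketch(unsigned int n)
{
        /* advance one slot, wrapping past index HFI1_MAX_RDMA_ATOMIC */
        return n >= HFI1_MAX_RDMA_ATOMIC ? 0 : n + 1;
}
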
2027static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
2028                          u32 lqpn, u32 rqpn, u8 svc_type)
2029{
2030        struct opa_hfi1_cong_log_event_internal *cc_event;
2031        unsigned long flags;
2032
2033        if (sl >= OPA_MAX_SLS)
2034                return;
2035
2036        spin_lock_irqsave(&ppd->cc_log_lock, flags);
2037
2038        ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
2039        ppd->threshold_event_counter++;
2040
2041        cc_event = &ppd->cc_events[ppd->cc_log_idx++];
2042        if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
2043                ppd->cc_log_idx = 0;
2044        cc_event->lqpn = lqpn & RVT_QPN_MASK;
2045        cc_event->rqpn = rqpn & RVT_QPN_MASK;
2046        cc_event->sl = sl;
2047        cc_event->svc_type = svc_type;
2048        cc_event->rlid = rlid;
2049        /* keep timestamp in units of 1.024 usec */
2050        cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;
2051
2052        spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
2053}
2054
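/*
 * Illustrative sketch only, not part of the driver build.  The
 * threshold_cong_event_map updated in log_cca_event() above is a plain
 * bitmap with one bit per service level, addressed a byte at a time.
 * The helper name below is invented.
 */
static inline void sl_bitmap_set_sketch(u8 *map, u8 sl)
{
        map[sl / 8] |= 1 << (sl % 8);   /* mark this SL as having logged an event */
}
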
2055void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
2056                  u32 rqpn, u8 svc_type)
2057{
2058        struct cca_timer *cca_timer;
2059        u16 ccti, ccti_incr, ccti_timer, ccti_limit;
2060        u8 trigger_threshold;
2061        struct cc_state *cc_state;
2062        unsigned long flags;
2063
2064        if (sl >= OPA_MAX_SLS)
2065                return;
2066
2067        cc_state = get_cc_state(ppd);
2068
2069        if (!cc_state)
2070                return;
2071
2072        /*
2073         * 1) increase CCTI (for this SL)
2074         * 2) select IPG (i.e., call set_link_ipg())
2075         * 3) start timer
2076         */
2077        ccti_limit = cc_state->cct.ccti_limit;
2078        ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
2079        ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
2080        trigger_threshold =
2081                cc_state->cong_setting.entries[sl].trigger_threshold;
2082
2083        spin_lock_irqsave(&ppd->cca_timer_lock, flags);
2084
2085        cca_timer = &ppd->cca_timer[sl];
2086        if (cca_timer->ccti < ccti_limit) {
2087                if (cca_timer->ccti + ccti_incr <= ccti_limit)
2088                        cca_timer->ccti += ccti_incr;
2089                else
2090                        cca_timer->ccti = ccti_limit;
2091                set_link_ipg(ppd);
2092        }
2093
2094        ccti = cca_timer->ccti;
2095
2096        if (!hrtimer_active(&cca_timer->hrtimer)) {
2097                /* ccti_timer is in units of 1.024 usec */
2098                unsigned long nsec = 1024 * ccti_timer;
2099
2100                hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
2101                              HRTIMER_MODE_REL);
2102        }
2103
2104        spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
2105
2106        if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
2107                log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
2108}
2109
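/*
 * Illustrative sketch only, not part of the driver build.  On a BECN,
 * process_becn() above raises the per-SL CCTI by ccti_increase but
 * never past ccti_limit, then arms an hrtimer whose period is
 * ccti_timer expressed in 1.024 usec units (1024 ns per unit).  A
 * sketch of the clamped increment; the helper name is invented.
 */
static inline u16 ccti_bump_sketch(u16 ccti, u16 incr, u16 limit)
{
        if (ccti >= limit)
                return ccti;            /* already saturated at the limit */
        return (ccti + incr <= limit) ? ccti + incr : limit;
}
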
2110/**
2111 * hfi1_rc_rcv - process an incoming RC packet
2112 * @packet: the packet descriptor
2113 *
2114 * The descriptor carries the receive context (rcd), the packet
2115 * header (hdr), the receive flags (rcv_flags), the packet data
2116 * (ebuf), the packet length (tlen), and the QP for this packet
2117 * (qp).
2118 *
2119 * This is called from qp_rcv() to process an incoming RC packet
2120 * for the given QP.
2121 * May be called at interrupt level.
2122 */
2123void hfi1_rc_rcv(struct hfi1_packet *packet)
2124{
2125        struct hfi1_ctxtdata *rcd = packet->rcd;
2126        struct ib_header *hdr = packet->hdr;
2127        u32 rcv_flags = packet->rcv_flags;
2128        void *data = packet->ebuf;
2129        u32 tlen = packet->tlen;
2130        struct rvt_qp *qp = packet->qp;
2131        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2132        struct ib_other_headers *ohdr = packet->ohdr;
2133        u32 bth0, opcode;
2134        u32 hdrsize = packet->hlen;
2135        u32 psn;
2136        u32 pad;
2137        struct ib_wc wc;
2138        u32 pmtu = qp->pmtu;
2139        int diff;
2140        struct ib_reth *reth;
2141        unsigned long flags;
2142        int ret, is_fecn = 0;
2143        int copy_last = 0;
2144        u32 rkey;
2145
2146        lockdep_assert_held(&qp->r_lock);
2147        bth0 = be32_to_cpu(ohdr->bth[0]);
2148        if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0))
2149                return;
2150
2151        is_fecn = process_ecn(qp, packet, false);
2152
2153        psn = be32_to_cpu(ohdr->bth[2]);
2154        opcode = (bth0 >> 24) & 0xff;
2155
2156        /*
2157         * Process responses (ACKs) before anything else.  Note that the
2158         * packet sequence number will be for something in the send work
2159         * queue rather than the expected receive packet sequence number.
2160         * In other words, this QP is the requester.
2161         */
2162        if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
2163            opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
2164                rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
2165                            hdrsize, pmtu, rcd);
2166                if (is_fecn)
2167                        goto send_ack;
2168                return;
2169        }
2170
2171        /* Compute 24 bits worth of difference. */
2172        diff = delta_psn(psn, qp->r_psn);
2173        if (unlikely(diff)) {
2174                if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
2175                        return;
2176                goto send_ack;
2177        }
2178
2179        /* Check for opcode sequence errors. */
2180        switch (qp->r_state) {
2181        case OP(SEND_FIRST):
2182        case OP(SEND_MIDDLE):
2183                if (opcode == OP(SEND_MIDDLE) ||
2184                    opcode == OP(SEND_LAST) ||
2185                    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2186                    opcode == OP(SEND_LAST_WITH_INVALIDATE))
2187                        break;
2188                goto nack_inv;
2189
2190        case OP(RDMA_WRITE_FIRST):
2191        case OP(RDMA_WRITE_MIDDLE):
2192                if (opcode == OP(RDMA_WRITE_MIDDLE) ||
2193                    opcode == OP(RDMA_WRITE_LAST) ||
2194                    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2195                        break;
2196                goto nack_inv;
2197
2198        default:
2199                if (opcode == OP(SEND_MIDDLE) ||
2200                    opcode == OP(SEND_LAST) ||
2201                    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
2202                    opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
2203                    opcode == OP(RDMA_WRITE_MIDDLE) ||
2204                    opcode == OP(RDMA_WRITE_LAST) ||
2205                    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
2206                        goto nack_inv;
2207                /*
2208                 * Note that it is up to the requester to not send a new
2209                 * RDMA read or atomic operation before receiving an ACK
2210                 * for the previous operation.
2211                 */
2212                break;
2213        }
2214
2215        if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2216                qp_comm_est(qp);
2217
2218        /* OK, process the packet. */
2219        switch (opcode) {
2220        case OP(SEND_FIRST):
2221                ret = hfi1_rvt_get_rwqe(qp, 0);
2222                if (ret < 0)
2223                        goto nack_op_err;
2224                if (!ret)
2225                        goto rnr_nak;
2226                qp->r_rcv_len = 0;
2227                /* FALLTHROUGH */
2228        case OP(SEND_MIDDLE):
2229        case OP(RDMA_WRITE_MIDDLE):
2230send_middle:
2231                /* Check for an invalid length (one PMTU) or posted rwqe overrun. */
2232                if (unlikely(tlen != (hdrsize + pmtu + 4)))
2233                        goto nack_inv;
2234                qp->r_rcv_len += pmtu;
2235                if (unlikely(qp->r_rcv_len > qp->r_len))
2236                        goto nack_inv;
2237                hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0);
2238                break;
2239
2240        case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
2241                /* consume RWQE */
2242                ret = hfi1_rvt_get_rwqe(qp, 1);
2243                if (ret < 0)
2244                        goto nack_op_err;
2245                if (!ret)
2246                        goto rnr_nak;
2247                goto send_last_imm;
2248
2249        case OP(SEND_ONLY):
2250        case OP(SEND_ONLY_WITH_IMMEDIATE):
2251        case OP(SEND_ONLY_WITH_INVALIDATE):
2252                ret = hfi1_rvt_get_rwqe(qp, 0);
2253                if (ret < 0)
2254                        goto nack_op_err;
2255                if (!ret)
2256                        goto rnr_nak;
2257                qp->r_rcv_len = 0;
2258                if (opcode == OP(SEND_ONLY))
2259                        goto no_immediate_data;
2260                if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
2261                        goto send_last_inv;
2262                /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2263        case OP(SEND_LAST_WITH_IMMEDIATE):
2264send_last_imm:
2265                wc.ex.imm_data = ohdr->u.imm_data;
2266                wc.wc_flags = IB_WC_WITH_IMM;
2267                goto send_last;
2268        case OP(SEND_LAST_WITH_INVALIDATE):
2269send_last_inv:
2270                rkey = be32_to_cpu(ohdr->u.ieth);
2271                if (rvt_invalidate_rkey(qp, rkey))
2272                        goto no_immediate_data;
2273                wc.ex.invalidate_rkey = rkey;
2274                wc.wc_flags = IB_WC_WITH_INVALIDATE;
2275                goto send_last;
2276        case OP(RDMA_WRITE_LAST):
2277                copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
2278                /* fall through */
2279        case OP(SEND_LAST):
2280no_immediate_data:
2281                wc.wc_flags = 0;
2282                wc.ex.imm_data = 0;
2283send_last:
2284                /* Get the number of bytes the message was padded by. */
2285                pad = (bth0 >> 20) & 3;
2286                /* Check for invalid length. */
2287                /* LAST len should be >= 1 */
2288                if (unlikely(tlen < (hdrsize + pad + 4)))
2289                        goto nack_inv;
2290                /* Don't count the CRC. */
2291                tlen -= (hdrsize + pad + 4);
2292                wc.byte_len = tlen + qp->r_rcv_len;
2293                if (unlikely(wc.byte_len > qp->r_len))
2294                        goto nack_inv;
2295                hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
2296                rvt_put_ss(&qp->r_sge);
2297                qp->r_msn++;
2298                if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
2299                        break;
2300                wc.wr_id = qp->r_wr_id;
2301                wc.status = IB_WC_SUCCESS;
2302                if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
2303                    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
2304                        wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
2305                else
2306                        wc.opcode = IB_WC_RECV;
2307                wc.qp = &qp->ibqp;
2308                wc.src_qp = qp->remote_qpn;
2309                wc.slid = qp->remote_ah_attr.dlid;
2310                /*
2311                 * It seems that IB mandates the presence of an SL in a
2312                 * work completion only for the UD transport (see section
2313                 * 11.4.2 of IBTA Vol. 1).
2314                 *
2315                 * However, the way the SL is chosen below is consistent
2316                 * with the way that IB/qib works and is trying to avoid
2317                 * introducing incompatibilities.
2318                 *
2319                 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
2320                 */
2321                wc.sl = qp->remote_ah_attr.sl;
2322                /* zero fields that are N/A */
2323                wc.vendor_err = 0;
2324                wc.pkey_index = 0;
2325                wc.dlid_path_bits = 0;
2326                wc.port_num = 0;
2327                /* Signal completion event if the solicited bit is set. */
2328                rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
2329                             (bth0 & IB_BTH_SOLICITED) != 0);
2330                break;
2331
2332        case OP(RDMA_WRITE_ONLY):
2333                copy_last = 1;
2334                /* fall through */
2335        case OP(RDMA_WRITE_FIRST):
2336        case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
2337                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
2338                        goto nack_inv;
2339                /* consume RWQE */
2340                reth = &ohdr->u.rc.reth;
2341                qp->r_len = be32_to_cpu(reth->length);
2342                qp->r_rcv_len = 0;
2343                qp->r_sge.sg_list = NULL;
2344                if (qp->r_len != 0) {
2345                        u32 rkey = be32_to_cpu(reth->rkey);
2346                        u64 vaddr = get_ib_reth_vaddr(reth);
2347                        int ok;
2348
2349                        /* Check rkey & NAK */
2350                        ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
2351                                         rkey, IB_ACCESS_REMOTE_WRITE);
2352                        if (unlikely(!ok))
2353                                goto nack_acc;
2354                        qp->r_sge.num_sge = 1;
2355                } else {
2356                        qp->r_sge.num_sge = 0;
2357                        qp->r_sge.sge.mr = NULL;
2358                        qp->r_sge.sge.vaddr = NULL;
2359                        qp->r_sge.sge.length = 0;
2360                        qp->r_sge.sge.sge_length = 0;
2361                }
2362                if (opcode == OP(RDMA_WRITE_FIRST))
2363                        goto send_middle;
2364                else if (opcode == OP(RDMA_WRITE_ONLY))
2365                        goto no_immediate_data;
2366                ret = hfi1_rvt_get_rwqe(qp, 1);
2367                if (ret < 0)
2368                        goto nack_op_err;
2369                if (!ret)
2370                        goto rnr_nak;
2371                wc.ex.imm_data = ohdr->u.rc.imm_data;
2372                wc.wc_flags = IB_WC_WITH_IMM;
2373                goto send_last;
2374
2375        case OP(RDMA_READ_REQUEST): {
2376                struct rvt_ack_entry *e;
2377                u32 len;
2378                u8 next;
2379
2380                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2381                        goto nack_inv;
2382                next = qp->r_head_ack_queue + 1;
2383                /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
2384                if (next > HFI1_MAX_RDMA_ATOMIC)
2385                        next = 0;
2386                spin_lock_irqsave(&qp->s_lock, flags);
2387                if (unlikely(next == qp->s_tail_ack_queue)) {
2388                        if (!qp->s_ack_queue[next].sent)
2389                                goto nack_inv_unlck;
2390                        update_ack_queue(qp, next);
2391                }
2392                e = &qp->s_ack_queue[qp->r_head_ack_queue];
2393                if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2394                        rvt_put_mr(e->rdma_sge.mr);
2395                        e->rdma_sge.mr = NULL;
2396                }
2397                reth = &ohdr->u.rc.reth;
2398                len = be32_to_cpu(reth->length);
2399                if (len) {
2400                        u32 rkey = be32_to_cpu(reth->rkey);
2401                        u64 vaddr = get_ib_reth_vaddr(reth);
2402                        int ok;
2403
2404                        /* Check rkey & NAK */
2405                        ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
2406                                         rkey, IB_ACCESS_REMOTE_READ);
2407                        if (unlikely(!ok))
2408                                goto nack_acc_unlck;
2409                        /*
2410                         * Update the next expected PSN.  We add 1 later
2411                         * below, so only add the remainder here.
2412                         */
2413                        if (len > pmtu)
2414                                qp->r_psn += (len - 1) / pmtu;
2415                } else {
2416                        e->rdma_sge.mr = NULL;
2417                        e->rdma_sge.vaddr = NULL;
2418                        e->rdma_sge.length = 0;
2419                        e->rdma_sge.sge_length = 0;
2420                }
2421                e->opcode = opcode;
2422                e->sent = 0;
2423                e->psn = psn;
2424                e->lpsn = qp->r_psn;
2425                /*
2426                 * We need to increment the MSN here instead of when we
2427                 * finish sending the result since a duplicate request would
2428                 * increment it more than once.
2429                 */
2430                qp->r_msn++;
2431                qp->r_psn++;
2432                qp->r_state = opcode;
2433                qp->r_nak_state = 0;
2434                qp->r_head_ack_queue = next;
2435
2436                /* Schedule the send engine. */
2437                qp->s_flags |= RVT_S_RESP_PENDING;
2438                hfi1_schedule_send(qp);
2439
2440                spin_unlock_irqrestore(&qp->s_lock, flags);
2441                if (is_fecn)
2442                        goto send_ack;
2443                return;
2444        }
2445
2446        case OP(COMPARE_SWAP):
2447        case OP(FETCH_ADD): {
2448                struct ib_atomic_eth *ateth;
2449                struct rvt_ack_entry *e;
2450                u64 vaddr;
2451                atomic64_t *maddr;
2452                u64 sdata;
2453                u32 rkey;
2454                u8 next;
2455
2456                if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
2457                        goto nack_inv;
2458                next = qp->r_head_ack_queue + 1;
2459                if (next > HFI1_MAX_RDMA_ATOMIC)
2460                        next = 0;
2461                spin_lock_irqsave(&qp->s_lock, flags);
2462                if (unlikely(next == qp->s_tail_ack_queue)) {
2463                        if (!qp->s_ack_queue[next].sent)
2464                                goto nack_inv_unlck;
2465                        update_ack_queue(qp, next);
2466                }
2467                e = &qp->s_ack_queue[qp->r_head_ack_queue];
2468                if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
2469                        rvt_put_mr(e->rdma_sge.mr);
2470                        e->rdma_sge.mr = NULL;
2471                }
2472                ateth = &ohdr->u.atomic_eth;
2473                vaddr = get_ib_ateth_vaddr(ateth);
2474                if (unlikely(vaddr & (sizeof(u64) - 1)))
2475                        goto nack_inv_unlck;
2476                rkey = be32_to_cpu(ateth->rkey);
2477                /* Check rkey & NAK */
2478                if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
2479                                          vaddr, rkey,
2480                                          IB_ACCESS_REMOTE_ATOMIC)))
2481                        goto nack_acc_unlck;
2482                /* Perform atomic OP and save result. */
2483                maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
2484                sdata = get_ib_ateth_swap(ateth);
2485                e->atomic_data = (opcode == OP(FETCH_ADD)) ?
2486                        (u64)atomic64_add_return(sdata, maddr) - sdata :
2487                        (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
2488                                      get_ib_ateth_compare(ateth),
2489                                      sdata);
2490                rvt_put_mr(qp->r_sge.sge.mr);
2491                qp->r_sge.num_sge = 0;
2492                e->opcode = opcode;
2493                e->sent = 0;
2494                e->psn = psn;
2495                e->lpsn = psn;
2496                qp->r_msn++;
2497                qp->r_psn++;
2498                qp->r_state = opcode;
2499                qp->r_nak_state = 0;
2500                qp->r_head_ack_queue = next;
2501
2502                /* Schedule the send engine. */
2503                qp->s_flags |= RVT_S_RESP_PENDING;
2504                hfi1_schedule_send(qp);
2505
2506                spin_unlock_irqrestore(&qp->s_lock, flags);
2507                if (is_fecn)
2508                        goto send_ack;
2509                return;
2510        }
2511
2512        default:
2513                /* NAK unknown opcodes. */
2514                goto nack_inv;
2515        }
2516        qp->r_psn++;
2517        qp->r_state = opcode;
2518        qp->r_ack_psn = psn;
2519        qp->r_nak_state = 0;
2520        /* Send an ACK if requested or required. */
2521        if (psn & IB_BTH_REQ_ACK) {
2522                struct hfi1_qp_priv *priv = qp->priv;
2523
2524                if (packet->numpkt == 0) {
2525                        rc_cancel_ack(qp);
2526                        goto send_ack;
2527                }
2528                if (priv->r_adefered >= HFI1_PSN_CREDIT) {
2529                        rc_cancel_ack(qp);
2530                        goto send_ack;
2531                }
2532                if (unlikely(is_fecn)) {
2533                        rc_cancel_ack(qp);
2534                        goto send_ack;
2535                }
2536                priv->r_adefered++;
2537                rc_defered_ack(rcd, qp);
2538        }
2539        return;
2540
2541rnr_nak:
2542        qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
2543        qp->r_ack_psn = qp->r_psn;
2544        /* Queue RNR NAK for later */
2545        rc_defered_ack(rcd, qp);
2546        return;
2547
2548nack_op_err:
2549        hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2550        qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2551        qp->r_ack_psn = qp->r_psn;
2552        /* Queue NAK for later */
2553        rc_defered_ack(rcd, qp);
2554        return;
2555
2556nack_inv_unlck:
2557        spin_unlock_irqrestore(&qp->s_lock, flags);
2558nack_inv:
2559        hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2560        qp->r_nak_state = IB_NAK_INVALID_REQUEST;
2561        qp->r_ack_psn = qp->r_psn;
2562        /* Queue NAK for later */
2563        rc_defered_ack(rcd, qp);
2564        return;
2565
2566nack_acc_unlck:
2567        spin_unlock_irqrestore(&qp->s_lock, flags);
2568nack_acc:
2569        hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
2570        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2571        qp->r_ack_psn = qp->r_psn;
2572send_ack:
2573        hfi1_send_rc_ack(rcd, qp, is_fecn);
2574}
2575
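/*
 * Illustrative sketch only, not part of the driver build.
 * hfi1_rc_rcv() above pulls its fields straight out of the BTH words:
 * the transport opcode from the top byte of bth[0], the 0..3 byte pad
 * count from bits 21:20 of bth[0], the 24-bit PSN from the low bits of
 * bth[2], and the requester's AckReq flag (IB_BTH_REQ_ACK), which the
 * code above tests against the unmasked bth[2].  The helper name below
 * is invented.
 */
static inline void bth_decode_sketch(u32 bth0, u32 bth2, u32 *opcode,
                                     u32 *pad, u32 *psn, int *ack_req)
{
        *opcode = (bth0 >> 24) & 0xff;          /* transport opcode */
        *pad = (bth0 >> 20) & 3;                /* pad bytes before the ICRC */
        *psn = bth2 & 0xffffff;                 /* 24-bit packet sequence number */
        *ack_req = !!(bth2 & IB_BTH_REQ_ACK);   /* explicit ACK requested */
}
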
2576void hfi1_rc_hdrerr(
2577        struct hfi1_ctxtdata *rcd,
2578        struct ib_header *hdr,
2579        u32 rcv_flags,
2580        struct rvt_qp *qp)
2581{
2582        int has_grh = rcv_flags & HFI1_HAS_GRH;
2583        struct ib_other_headers *ohdr;
2584        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2585        int diff;
2586        u32 opcode;
2587        u32 psn, bth0;
2588
2589        /* Check for GRH */
2590        ohdr = &hdr->u.oth;
2591        if (has_grh)
2592                ohdr = &hdr->u.l.oth;
2593
2594        bth0 = be32_to_cpu(ohdr->bth[0]);
2595        if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
2596                return;
2597
2598        psn = be32_to_cpu(ohdr->bth[2]);
2599        opcode = (bth0 >> 24) & 0xff;
2600
2601        /* Only deal with RDMA Writes for now */
2602        if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
2603                diff = delta_psn(psn, qp->r_psn);
2604                if (!qp->r_nak_state && diff >= 0) {
2605                        ibp->rvp.n_rc_seqnak++;
2606                        qp->r_nak_state = IB_NAK_PSN_ERROR;
2607                        /* Use the expected PSN. */
2608                        qp->r_ack_psn = qp->r_psn;
2609                        /*
2610                         * Wait to send the sequence
2611                         * NAK until all packets
2612                         * in the receive queue have
2613                         * been processed.
2614                         * Otherwise, we end up
2615                         * propagating congestion.
2616                         */
2617                        rc_defered_ack(rcd, qp);
2618                } /* Out of sequence NAK */
2619        } /* QP Request NAKs */
2620}
2621