linux/drivers/infiniband/hw/hfi1/tid_rdma.c
   1// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
   2/*
   3 * Copyright(c) 2018 Intel Corporation.
   4 *
   5 */
   6
   7#include "hfi.h"
   8#include "qp.h"
   9#include "rc.h"
  10#include "verbs.h"
  11#include "tid_rdma.h"
  12#include "exp_rcv.h"
  13#include "trace.h"
  14
  15/**
  16 * DOC: TID RDMA READ protocol
  17 *
  18 * This is an end-to-end protocol at the hfi1 level between two nodes that
  19 * improves performance by avoiding data copy on the requester side. It
  20 * converts a qualified RDMA READ request into a TID RDMA READ request on
  21 * the requester side and thereafter handles the request and response
  22 * differently. To be qualified, the RDMA READ request should meet the
  23 * following:
  24 * -- The total data length should be greater than 256K;
  25 * -- The total data length should be a multiple of 4K page size;
  26 * -- Each local scatter-gather entry should be 4K page aligned;
  27 * -- Each local scatter-gather entry should be a multiple of 4K page size;
  28 */
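
/*
 * Hedged sketch (not part of the driver): one way the qualification rules
 * above could be expressed. The helper name and the flat address/length
 * arrays are assumptions made for illustration; the real check is done
 * against the WQE's scatter-gather list in the send path. Assumes the
 * <linux/sizes.h> definitions (SZ_4K, SZ_256K) are available.
 */
static inline bool example_tid_read_qualifies(u64 total_len,
                                              const u64 *sge_addrs,
                                              const u32 *sge_lens, int nsge)
{
        int i;

        /* Total data length must be greater than 256K and a 4K multiple */
        if (total_len <= SZ_256K || !IS_ALIGNED(total_len, SZ_4K))
                return false;
        /* Each local scatter-gather entry: 4K aligned and a 4K multiple */
        for (i = 0; i < nsge; i++)
                if (!IS_ALIGNED(sge_addrs[i], SZ_4K) ||
                    !IS_ALIGNED(sge_lens[i], SZ_4K))
                        return false;
        return true;
}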
  29
  30#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
  31#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
  32#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
  33#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
  34#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
  35#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
  36
  37/* Maximum number of packets within a flow generation. */
  38#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
  39
  40#define GENERATION_MASK 0xFFFFF
  41
  42static u32 mask_generation(u32 a)
  43{
  44        return a & GENERATION_MASK;
  45}
  46
  47/* Reserved generation value to set to unused flows for kernel contexts */
  48#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
  49
  50/*
  51 * J_KEY for kernel contexts when TID RDMA is used.
  52 * See generate_jkey() in hfi.h for more information.
  53 */
  54#define TID_RDMA_JKEY                   32
  55#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
  56#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
  57
  58/* Maximum number of segments in flight per QP request. */
  59#define TID_RDMA_MAX_READ_SEGS_PER_REQ  6
  60#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
  61#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
  62                        TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
  63#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
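/* With the segment counts above, MAX_FLOWS evaluates to roundup_pow_of_two(6 + 1) = 8 */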
  64
  65#define MAX_EXPECTED_PAGES     (MAX_EXPECTED_BUFFER / PAGE_SIZE)
  66
  67#define TID_RDMA_DESTQP_FLOW_SHIFT      11
  68#define TID_RDMA_DESTQP_FLOW_MASK       0x1f
  69
  70#define TID_OPFN_QP_CTXT_MASK 0xff
  71#define TID_OPFN_QP_CTXT_SHIFT 56
  72#define TID_OPFN_QP_KDETH_MASK 0xff
  73#define TID_OPFN_QP_KDETH_SHIFT 48
  74#define TID_OPFN_MAX_LEN_MASK 0x7ff
  75#define TID_OPFN_MAX_LEN_SHIFT 37
  76#define TID_OPFN_TIMEOUT_MASK 0x1f
  77#define TID_OPFN_TIMEOUT_SHIFT 32
  78#define TID_OPFN_RESERVED_MASK 0x3f
  79#define TID_OPFN_RESERVED_SHIFT 26
  80#define TID_OPFN_URG_MASK 0x1
  81#define TID_OPFN_URG_SHIFT 25
  82#define TID_OPFN_VER_MASK 0x7
  83#define TID_OPFN_VER_SHIFT 22
  84#define TID_OPFN_JKEY_MASK 0x3f
  85#define TID_OPFN_JKEY_SHIFT 16
  86#define TID_OPFN_MAX_READ_MASK 0x3f
  87#define TID_OPFN_MAX_READ_SHIFT 10
  88#define TID_OPFN_MAX_WRITE_MASK 0x3f
  89#define TID_OPFN_MAX_WRITE_SHIFT 4
  90
  91/*
  92 * OPFN TID layout
  93 *
  94 * 63               47               31               15
  95 * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
  96 * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
  97 * N - the context Number
  98 * K - the Kdeth_qp
  99 * M - Max_len
 100 * T - Timeout
 101 * D - reserveD
 102 * V - version
 103 * U - Urg capable
 104 * J - Jkey
 105 * R - max_Read
 106 * W - max_Write
 107 * C - Capcode
 108 */
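
/*
 * Note on the Max_len field above: it is stored as (max_len >> PAGE_SHIFT) - 1
 * in 11 bits, so with 4 KiB pages the encodable segment size ranges from
 * 4 KiB up to 2048 pages (8 MiB). See tid_rdma_opfn_encode()/_decode() below.
 */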
 109
 110static u32 tid_rdma_flow_wt;
 111
 112static void tid_rdma_trigger_resume(struct work_struct *work);
 113static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
 114static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
 115                                         gfp_t gfp);
 116static void hfi1_init_trdma_req(struct rvt_qp *qp,
 117                                struct tid_rdma_request *req);
 118static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
 119static void hfi1_tid_timeout(struct timer_list *t);
 120static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
 121static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
 122static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
 123static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
 124static void hfi1_tid_retry_timeout(struct timer_list *t);
 125static int make_tid_rdma_ack(struct rvt_qp *qp,
 126                             struct ib_other_headers *ohdr,
 127                             struct hfi1_pkt_state *ps);
 128static void hfi1_do_tid_send(struct rvt_qp *qp);
 129static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
 130static void tid_rdma_rcv_err(struct hfi1_packet *packet,
 131                             struct ib_other_headers *ohdr,
 132                             struct rvt_qp *qp, u32 psn, int diff, bool fecn);
 133static void update_r_next_psn_fecn(struct hfi1_packet *packet,
 134                                   struct hfi1_qp_priv *priv,
 135                                   struct hfi1_ctxtdata *rcd,
 136                                   struct tid_rdma_flow *flow,
 137                                   bool fecn);
 138
 139static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 140{
 141        return
 142                (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
 143                        TID_OPFN_QP_CTXT_SHIFT) |
 144                ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
 145                        TID_OPFN_QP_KDETH_SHIFT) |
 146                (((u64)((p->max_len >> PAGE_SHIFT) - 1) &
 147                        TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
 148                (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
 149                        TID_OPFN_TIMEOUT_SHIFT) |
 150                (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
 151                (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
 152                (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
 153                        TID_OPFN_MAX_READ_SHIFT) |
 154                (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
 155                        TID_OPFN_MAX_WRITE_SHIFT);
 156}
 157
 158static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
 159{
 160        p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
 161                TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
 162        p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
 163        p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
 164                TID_OPFN_MAX_WRITE_MASK;
 165        p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
 166                TID_OPFN_MAX_READ_MASK;
 167        p->qp =
 168                ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
 169                        << 16) |
 170                ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
 171        p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
 172        p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
 173}
 174
 175void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
 176{
 177        struct hfi1_qp_priv *priv = qp->priv;
 178
 179        p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
 180        p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
 181        p->jkey = priv->rcd->jkey;
 182        p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
 183        p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
 184        p->timeout = qp->timeout;
 185        p->urg = is_urg_masked(priv->rcd);
 186}
 187
 188bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
 189{
 190        struct hfi1_qp_priv *priv = qp->priv;
 191
 192        *data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
 193        return true;
 194}
 195
 196bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
 197{
 198        struct hfi1_qp_priv *priv = qp->priv;
 199        struct tid_rdma_params *remote, *old;
 200        bool ret = true;
 201
 202        old = rcu_dereference_protected(priv->tid_rdma.remote,
 203                                        lockdep_is_held(&priv->opfn.lock));
 204        data &= ~0xfULL;
 205        /*
 206         * If data passed in is zero, return true so as not to continue the
 207         * negotiation process
 208         */
 209        if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
 210                goto null;
 211        /*
 212         * If kzalloc fails, return false. This will result in:
 213         * * at the requester a new OPFN request being generated to retry
 214         *   the negotiation
 215         * * at the responder, 0 being returned to the requester so as to
 216         *   disable TID RDMA at both the requester and the responder
 217         */
 218        remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
 219        if (!remote) {
 220                ret = false;
 221                goto null;
 222        }
 223
 224        tid_rdma_opfn_decode(remote, data);
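        /*
         * The remote timeout field uses the IB local ACK timeout encoding of
         * 4.096 us * 2^timeout; the value computed below is 8 * 7 = 56 such
         * intervals, converted to jiffies.
         */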
 225        priv->tid_timer_timeout_jiffies =
 226                usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
 227                                   1000UL) << 3) * 7);
 228        trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
 229        trace_hfi1_opfn_param(qp, 1, remote);
 230        rcu_assign_pointer(priv->tid_rdma.remote, remote);
 231        /*
 232         * A TID RDMA READ request's segment size is not equal to
 233         * remote->max_len only when the request's data length is smaller
 234         * than remote->max_len. In that case, there will be only one segment.
 235         * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
 236         * during retry, it will lead to req->cur_seg = 0, which is exactly
 237         * what is expected.
 238         */
 239        priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
 240        priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
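        /*
         * Worked example (assumed values): with remote->max_len = 256 KiB and
         * a 4 KiB MTU, pkts_ps = 64 and timeout_shift = ilog2(63) + 1 = 6.
         */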
 241        goto free;
 242null:
 243        RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
 244        priv->timeout_shift = 0;
 245free:
 246        if (old)
 247                kfree_rcu(old, rcu_head);
 248        return ret;
 249}
 250
 251bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
 252{
 253        bool ret;
 254
 255        ret = tid_rdma_conn_reply(qp, *data);
 256        *data = 0;
 257        /*
 258         * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
 259         * TID RDMA could not be enabled. This will result in TID RDMA being
 260         * disabled at the requester too.
 261         */
 262        if (ret)
 263                (void)tid_rdma_conn_req(qp, data);
 264        return ret;
 265}
 266
 267void tid_rdma_conn_error(struct rvt_qp *qp)
 268{
 269        struct hfi1_qp_priv *priv = qp->priv;
 270        struct tid_rdma_params *old;
 271
 272        old = rcu_dereference_protected(priv->tid_rdma.remote,
 273                                        lockdep_is_held(&priv->opfn.lock));
 274        RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
 275        if (old)
 276                kfree_rcu(old, rcu_head);
 277}
 278
 279/* This is called at context initialization time */
 280int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
 281{
 282        if (reinit)
 283                return 0;
 284
 285        BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
 286        BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
 287        rcd->jkey = TID_RDMA_JKEY;
 288        hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
 289        return hfi1_alloc_ctxt_rcv_groups(rcd);
 290}
 291
 292/**
 293 * qp_to_rcd - determine the receive context used by a qp
 294 * @qp - the qp
 295 *
 296 * This routine returns the receive context associated
  297 * with a qp's qpn.
 298 *
 299 * Returns the context.
 300 */
 301static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
 302                                       struct rvt_qp *qp)
 303{
 304        struct hfi1_ibdev *verbs_dev = container_of(rdi,
 305                                                    struct hfi1_ibdev,
 306                                                    rdi);
 307        struct hfi1_devdata *dd = container_of(verbs_dev,
 308                                               struct hfi1_devdata,
 309                                               verbs_dev);
 310        unsigned int ctxt;
 311
 312        if (qp->ibqp.qp_num == 0)
 313                ctxt = 0;
 314        else
 315                ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift);
 316        return dd->rcd[ctxt];
 317}
 318
 319int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
 320                      struct ib_qp_init_attr *init_attr)
 321{
 322        struct hfi1_qp_priv *qpriv = qp->priv;
 323        int i, ret;
 324
 325        qpriv->rcd = qp_to_rcd(rdi, qp);
 326
 327        spin_lock_init(&qpriv->opfn.lock);
 328        INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
 329        INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
 330        qpriv->flow_state.psn = 0;
 331        qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
 332        qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
 333        qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
 334        qpriv->s_state = TID_OP(WRITE_RESP);
 335        qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
 336        qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
 337        qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
 338        qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
 339        qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
 340        qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
 341        qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
 342        qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
 343        atomic_set(&qpriv->n_requests, 0);
 344        atomic_set(&qpriv->n_tid_requests, 0);
 345        timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
 346        timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
 347        INIT_LIST_HEAD(&qpriv->tid_wait);
 348
 349        if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
 350                struct hfi1_devdata *dd = qpriv->rcd->dd;
 351
 352                qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
 353                                                sizeof(*qpriv->pages),
 354                                            GFP_KERNEL, dd->node);
 355                if (!qpriv->pages)
 356                        return -ENOMEM;
 357                for (i = 0; i < qp->s_size; i++) {
 358                        struct hfi1_swqe_priv *priv;
 359                        struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
 360
 361                        priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
 362                                            dd->node);
 363                        if (!priv)
 364                                return -ENOMEM;
 365
 366                        hfi1_init_trdma_req(qp, &priv->tid_req);
 367                        priv->tid_req.e.swqe = wqe;
 368                        wqe->priv = priv;
 369                }
 370                for (i = 0; i < rvt_max_atomic(rdi); i++) {
 371                        struct hfi1_ack_priv *priv;
 372
 373                        priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
 374                                            dd->node);
 375                        if (!priv)
 376                                return -ENOMEM;
 377
 378                        hfi1_init_trdma_req(qp, &priv->tid_req);
 379                        priv->tid_req.e.ack = &qp->s_ack_queue[i];
 380
 381                        ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
 382                                                            GFP_KERNEL);
 383                        if (ret) {
 384                                kfree(priv);
 385                                return ret;
 386                        }
 387                        qp->s_ack_queue[i].priv = priv;
 388                }
 389        }
 390
 391        return 0;
 392}
 393
 394void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
 395{
 396        struct hfi1_qp_priv *qpriv = qp->priv;
 397        struct rvt_swqe *wqe;
 398        u32 i;
 399
 400        if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
 401                for (i = 0; i < qp->s_size; i++) {
 402                        wqe = rvt_get_swqe_ptr(qp, i);
 403                        kfree(wqe->priv);
 404                        wqe->priv = NULL;
 405                }
 406                for (i = 0; i < rvt_max_atomic(rdi); i++) {
 407                        struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
 408
 409                        if (priv)
 410                                hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
 411                        kfree(priv);
 412                        qp->s_ack_queue[i].priv = NULL;
 413                }
 414                cancel_work_sync(&qpriv->opfn.opfn_work);
 415                kfree(qpriv->pages);
 416                qpriv->pages = NULL;
 417        }
 418}
 419
 420/* Flow and tid waiter functions */
 421/**
 422 * DOC: lock ordering
 423 *
 424 * There are two locks involved with the queuing
 425 * routines: the qp s_lock and the exp_lock.
 426 *
 427 * Since the tid space allocation is called from
 428 * the send engine, the qp s_lock is already held.
 429 *
 430 * The allocation routines will get the exp_lock.
 431 *
 432 * The first_qp() call is provided to allow the head of
 433 * the rcd wait queue to be fetched under the exp_lock and
 434 * followed by a drop of the exp_lock.
 435 *
 436 * Any qp in the wait list will have the qp reference count held
 437 * to hold the qp in memory.
 438 */
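
/*
 * Hedged sketch of the ordering described above (illustrative only; the real
 * users are the flow and TID allocation routines later in this file):
 *
 *	lockdep_assert_held(&qp->s_lock);	     <- held by the send engine
 *	spin_lock_irqsave(&rcd->exp_lock, flags);
 *	... reserve flow/TID resources or queue_qp_for_tid_wait() ...
 *	fqp = first_qp(rcd, queue);		     <- takes a qp reference
 *	spin_unlock_irqrestore(&rcd->exp_lock, flags);
 *	tid_rdma_schedule_tid_wakeup(fqp);	     <- disposes of the reference
 */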
 439
 440/*
 441 * return head of rcd wait list
 442 *
 443 * Must hold the exp_lock.
 444 *
 445 * Get a reference to the QP to hold the QP in memory.
 446 *
  447 * The caller must release the reference when the local
  448 * qp pointer is no longer being used.
 449 */
 450static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
 451                               struct tid_queue *queue)
 452        __must_hold(&rcd->exp_lock)
 453{
 454        struct hfi1_qp_priv *priv;
 455
 456        lockdep_assert_held(&rcd->exp_lock);
 457        priv = list_first_entry_or_null(&queue->queue_head,
 458                                        struct hfi1_qp_priv,
 459                                        tid_wait);
 460        if (!priv)
 461                return NULL;
 462        rvt_get_qp(priv->owner);
 463        return priv->owner;
 464}
 465
 466/**
 467 * kernel_tid_waiters - determine rcd wait
 468 * @rcd: the receive context
 469 * @qp: the head of the qp being processed
 470 *
 471 * This routine will return false IFF
  472 * the list is empty or the head of the
 473 * list is the indicated qp.
 474 *
 475 * Must hold the qp s_lock and the exp_lock.
 476 *
 477 * Return:
 478 * false if either of the conditions below are satisfied:
 479 * 1. The list is empty or
 480 * 2. The indicated qp is at the head of the list and the
 481 *    HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
 482 * true is returned otherwise.
 483 */
 484static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
 485                               struct tid_queue *queue, struct rvt_qp *qp)
 486        __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
 487{
 488        struct rvt_qp *fqp;
 489        bool ret = true;
 490
 491        lockdep_assert_held(&qp->s_lock);
 492        lockdep_assert_held(&rcd->exp_lock);
 493        fqp = first_qp(rcd, queue);
 494        if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
 495                ret = false;
 496        rvt_put_qp(fqp);
 497        return ret;
 498}
 499
 500/**
 501 * dequeue_tid_waiter - dequeue the qp from the list
  502 * @qp - the qp to remove from the wait list
 503 *
 504 * This routine removes the indicated qp from the
 505 * wait list if it is there.
 506 *
 507 * This should be done after the hardware flow and
 508 * tid array resources have been allocated.
 509 *
 510 * Must hold the qp s_lock and the rcd exp_lock.
 511 *
 512 * It assumes the s_lock to protect the s_flags
 513 * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
 514 */
 515static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
 516                               struct tid_queue *queue, struct rvt_qp *qp)
 517        __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
 518{
 519        struct hfi1_qp_priv *priv = qp->priv;
 520
 521        lockdep_assert_held(&qp->s_lock);
 522        lockdep_assert_held(&rcd->exp_lock);
 523        if (list_empty(&priv->tid_wait))
 524                return;
 525        list_del_init(&priv->tid_wait);
 526        qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
 527        queue->dequeue++;
 528        rvt_put_qp(qp);
 529}
 530
 531/**
 532 * queue_qp_for_tid_wait - suspend QP on tid space
 533 * @rcd: the receive context
 534 * @qp: the qp
 535 *
 536 * The qp is inserted at the tail of the rcd
 537 * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
 538 *
 539 * Must hold the qp s_lock and the exp_lock.
 540 */
 541static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
 542                                  struct tid_queue *queue, struct rvt_qp *qp)
 543        __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
 544{
 545        struct hfi1_qp_priv *priv = qp->priv;
 546
 547        lockdep_assert_held(&qp->s_lock);
 548        lockdep_assert_held(&rcd->exp_lock);
 549        if (list_empty(&priv->tid_wait)) {
 550                qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
 551                list_add_tail(&priv->tid_wait, &queue->queue_head);
 552                priv->tid_enqueue = ++queue->enqueue;
 553                rcd->dd->verbs_dev.n_tidwait++;
 554                trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
 555                rvt_get_qp(qp);
 556        }
 557}
 558
 559/**
 560 * __trigger_tid_waiter - trigger tid waiter
 561 * @qp: the qp
 562 *
 563 * This is a private entrance to schedule the qp
 564 * assuming the caller is holding the qp->s_lock.
 565 */
 566static void __trigger_tid_waiter(struct rvt_qp *qp)
 567        __must_hold(&qp->s_lock)
 568{
 569        lockdep_assert_held(&qp->s_lock);
 570        if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
 571                return;
 572        trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
 573        hfi1_schedule_send(qp);
 574}
 575
 576/**
 577 * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
 578 * @qp - the qp
 579 *
  580 * Trigger a schedule for a waiting qp in a deadlock
 581 * safe manner.  The qp reference is held prior
 582 * to this call via first_qp().
 583 *
 584 * If the qp trigger was already scheduled (!rval)
  585 * the reference is dropped, otherwise the resume
 586 * or the destroy cancel will dispatch the reference.
 587 */
 588static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
 589{
 590        struct hfi1_qp_priv *priv;
 591        struct hfi1_ibport *ibp;
 592        struct hfi1_pportdata *ppd;
 593        struct hfi1_devdata *dd;
 594        bool rval;
 595
 596        if (!qp)
 597                return;
 598
 599        priv = qp->priv;
 600        ibp = to_iport(qp->ibqp.device, qp->port_num);
 601        ppd = ppd_from_ibp(ibp);
 602        dd = dd_from_ibdev(qp->ibqp.device);
 603
 604        rval = queue_work_on(priv->s_sde ?
 605                             priv->s_sde->cpu :
 606                             cpumask_first(cpumask_of_node(dd->node)),
 607                             ppd->hfi1_wq,
 608                             &priv->tid_rdma.trigger_work);
 609        if (!rval)
 610                rvt_put_qp(qp);
 611}
 612
 613/**
 614 * tid_rdma_trigger_resume - field a trigger work request
 615 * @work - the work item
 616 *
 617 * Complete the off qp trigger processing by directly
 618 * calling the progress routine.
 619 */
 620static void tid_rdma_trigger_resume(struct work_struct *work)
 621{
 622        struct tid_rdma_qp_params *tr;
 623        struct hfi1_qp_priv *priv;
 624        struct rvt_qp *qp;
 625
 626        tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
 627        priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
 628        qp = priv->owner;
 629        spin_lock_irq(&qp->s_lock);
 630        if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
 631                spin_unlock_irq(&qp->s_lock);
 632                hfi1_do_send(priv->owner, true);
 633        } else {
 634                spin_unlock_irq(&qp->s_lock);
 635        }
 636        rvt_put_qp(qp);
 637}
 638
 639/**
 640 * tid_rdma_flush_wait - unwind any tid space wait
 641 *
 642 * This is called when resetting a qp to
 643 * allow a destroy or reset to get rid
 644 * of any tid space linkage and reference counts.
 645 */
 646static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
 647        __must_hold(&qp->s_lock)
 648{
 649        struct hfi1_qp_priv *priv;
 650
 651        if (!qp)
 652                return;
 653        lockdep_assert_held(&qp->s_lock);
 654        priv = qp->priv;
 655        qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
 656        spin_lock(&priv->rcd->exp_lock);
 657        if (!list_empty(&priv->tid_wait)) {
 658                list_del_init(&priv->tid_wait);
 659                qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
 660                queue->dequeue++;
 661                rvt_put_qp(qp);
 662        }
 663        spin_unlock(&priv->rcd->exp_lock);
 664}
 665
 666void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
 667        __must_hold(&qp->s_lock)
 668{
 669        struct hfi1_qp_priv *priv = qp->priv;
 670
 671        _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
 672        _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
 673}
 674
 675/* Flow functions */
 676/**
 677 * kern_reserve_flow - allocate a hardware flow
 678 * @rcd - the context to use for allocation
 679 * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
 680 *         signify "don't care".
 681 *
 682 * Use a bit mask based allocation to reserve a hardware
 683 * flow for use in receiving KDETH data packets. If a preferred flow is
 684 * specified the function will attempt to reserve that flow again, if
 685 * available.
 686 *
 687 * The exp_lock must be held.
 688 *
 689 * Return:
  690 * On success: a value between 0 and RXE_NUM_TID_FLOWS - 1
 691 * On failure: -EAGAIN
 692 */
 693static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
 694        __must_hold(&rcd->exp_lock)
 695{
 696        int nr;
 697
 698        /* Attempt to reserve the preferred flow index */
 699        if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
 700            !test_and_set_bit(last, &rcd->flow_mask))
 701                return last;
 702
 703        nr = ffz(rcd->flow_mask);
 704        BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
 705                     (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
 706        if (nr > (RXE_NUM_TID_FLOWS - 1))
 707                return -EAGAIN;
 708        set_bit(nr, &rcd->flow_mask);
 709        return nr;
 710}
 711
 712static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
 713                             u32 flow_idx)
 714{
 715        u64 reg;
 716
 717        reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
 718                RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
 719                RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
 720                RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
 721                RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
 722                RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
 723
 724        if (generation != KERN_GENERATION_RESERVED)
 725                reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
 726
 727        write_uctxt_csr(rcd->dd, rcd->ctxt,
 728                        RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
 729}
 730
 731static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
 732        __must_hold(&rcd->exp_lock)
 733{
 734        u32 generation = rcd->flows[flow_idx].generation;
 735
 736        kern_set_hw_flow(rcd, generation, flow_idx);
 737        return generation;
 738}
 739
 740static u32 kern_flow_generation_next(u32 gen)
 741{
 742        u32 generation = mask_generation(gen + 1);
 743
 744        if (generation == KERN_GENERATION_RESERVED)
 745                generation = mask_generation(generation + 1);
 746        return generation;
 747}
 748
 749static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
 750        __must_hold(&rcd->exp_lock)
 751{
 752        rcd->flows[flow_idx].generation =
 753                kern_flow_generation_next(rcd->flows[flow_idx].generation);
 754        kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
 755}
 756
 757int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
 758{
 759        struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
 760        struct tid_flow_state *fs = &qpriv->flow_state;
 761        struct rvt_qp *fqp;
 762        unsigned long flags;
 763        int ret = 0;
 764
 765        /* The QP already has an allocated flow */
 766        if (fs->index != RXE_NUM_TID_FLOWS)
 767                return ret;
 768
 769        spin_lock_irqsave(&rcd->exp_lock, flags);
 770        if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
 771                goto queue;
 772
 773        ret = kern_reserve_flow(rcd, fs->last_index);
 774        if (ret < 0)
 775                goto queue;
 776        fs->index = ret;
 777        fs->last_index = fs->index;
 778
 779        /* Generation received in a RESYNC overrides default flow generation */
 780        if (fs->generation != KERN_GENERATION_RESERVED)
 781                rcd->flows[fs->index].generation = fs->generation;
 782        fs->generation = kern_setup_hw_flow(rcd, fs->index);
 783        fs->psn = 0;
 784        dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
 785        /* get head before dropping lock */
 786        fqp = first_qp(rcd, &rcd->flow_queue);
 787        spin_unlock_irqrestore(&rcd->exp_lock, flags);
 788
 789        tid_rdma_schedule_tid_wakeup(fqp);
 790        return 0;
 791queue:
 792        queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
 793        spin_unlock_irqrestore(&rcd->exp_lock, flags);
 794        return -EAGAIN;
 795}
 796
 797void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
 798{
 799        struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
 800        struct tid_flow_state *fs = &qpriv->flow_state;
 801        struct rvt_qp *fqp;
 802        unsigned long flags;
 803
 804        if (fs->index >= RXE_NUM_TID_FLOWS)
 805                return;
 806        spin_lock_irqsave(&rcd->exp_lock, flags);
 807        kern_clear_hw_flow(rcd, fs->index);
 808        clear_bit(fs->index, &rcd->flow_mask);
 809        fs->index = RXE_NUM_TID_FLOWS;
 810        fs->psn = 0;
 811        fs->generation = KERN_GENERATION_RESERVED;
 812
 813        /* get head before dropping lock */
 814        fqp = first_qp(rcd, &rcd->flow_queue);
 815        spin_unlock_irqrestore(&rcd->exp_lock, flags);
 816
 817        if (fqp == qp) {
 818                __trigger_tid_waiter(fqp);
 819                rvt_put_qp(fqp);
 820        } else {
 821                tid_rdma_schedule_tid_wakeup(fqp);
 822        }
 823}
 824
 825void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
 826{
 827        int i;
 828
 829        for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
 830                rcd->flows[i].generation = mask_generation(prandom_u32());
 831                kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
 832        }
 833}
 834
 835/* TID allocation functions */
 836static u8 trdma_pset_order(struct tid_rdma_pageset *s)
 837{
 838        u8 count = s->count;
 839
 840        return ilog2(count) + 1;
 841}
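
/*
 * Example: a 1-page set yields order 1, a 2-page (8 KiB) set order 2 and an
 * 8-page (32 KiB) set order 4; this is the order value handed to
 * hfi1_put_tid() when the rcvarray entries are programmed below.
 */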
 842
 843/**
  844 * tid_rdma_find_phys_blocks_4k - get groups based on mr info
 845 * @npages - number of pages
 846 * @pages - pointer to an array of page structs
 847 * @list - page set array to return
 848 *
 849 * This routine returns the number of groups associated with
 850 * the current sge information.  This implementation is based
 851 * on the expected receive find_phys_blocks() adjusted to
 852 * use the MR information vs. the pfn.
 853 *
 854 * Return:
 855 * the number of RcvArray entries
 856 */
 857static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
 858                                        struct page **pages,
 859                                        u32 npages,
 860                                        struct tid_rdma_pageset *list)
 861{
 862        u32 pagecount, pageidx, setcount = 0, i;
 863        void *vaddr, *this_vaddr;
 864
 865        if (!npages)
 866                return 0;
 867
 868        /*
 869         * Look for sets of physically contiguous pages in the user buffer.
 870         * This will allow us to optimize Expected RcvArray entry usage by
 871         * using the bigger supported sizes.
 872         */
 873        vaddr = page_address(pages[0]);
 874        trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
 875        for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
 876                this_vaddr = i < npages ? page_address(pages[i]) : NULL;
 877                trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
 878                                         this_vaddr);
 879                /*
 880                 * If the vaddr's are not sequential, pages are not physically
 881                 * contiguous.
 882                 */
 883                if (this_vaddr != (vaddr + PAGE_SIZE)) {
 884                        /*
 885                         * At this point we have to loop over the set of
  886                         * physically contiguous pages and break them down into
 887                         * sizes supported by the HW.
 888                         * There are two main constraints:
 889                         *     1. The max buffer size is MAX_EXPECTED_BUFFER.
 890                         *        If the total set size is bigger than that
 891                         *        program only a MAX_EXPECTED_BUFFER chunk.
 892                         *     2. The buffer size has to be a power of two. If
  893                         * it is not, round down to the closest power of
 894                         *        2 and program that size.
 895                         */
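                        /*
                         * Worked example (4 KiB pages): a run of 13 contiguous
                         * pages (52 KiB) is emitted as sets of 8, 4 and 1
                         * pages (32 KiB, 16 KiB and 4 KiB).
                         */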
 896                        while (pagecount) {
 897                                int maxpages = pagecount;
 898                                u32 bufsize = pagecount * PAGE_SIZE;
 899
 900                                if (bufsize > MAX_EXPECTED_BUFFER)
 901                                        maxpages =
 902                                                MAX_EXPECTED_BUFFER >>
 903                                                PAGE_SHIFT;
 904                                else if (!is_power_of_2(bufsize))
 905                                        maxpages =
 906                                                rounddown_pow_of_two(bufsize) >>
 907                                                PAGE_SHIFT;
 908
 909                                list[setcount].idx = pageidx;
 910                                list[setcount].count = maxpages;
 911                                trace_hfi1_tid_pageset(flow->req->qp, setcount,
 912                                                       list[setcount].idx,
 913                                                       list[setcount].count);
 914                                pagecount -= maxpages;
 915                                pageidx += maxpages;
 916                                setcount++;
 917                        }
 918                        pageidx = i;
 919                        pagecount = 1;
 920                        vaddr = this_vaddr;
 921                } else {
 922                        vaddr += PAGE_SIZE;
 923                        pagecount++;
 924                }
 925        }
  926        /* ensure we always return an even number of sets */
 927        if (setcount & 1)
 928                list[setcount++].count = 0;
 929        return setcount;
 930}
 931
 932/**
 933 * tid_flush_pages - dump out pages into pagesets
 934 * @list - list of pagesets
 935 * @idx - pointer to current page index
 936 * @pages - number of pages to dump
  937 * @sets - current number of pagesets
  938 *
  939 * This routine flushes out accumulated pages.
  940 *
  941 * To ensure an even number of sets the
  942 * code may add a filler.
  943 *
  944 * This can happen when pages is not
 945 * a power of 2 or pages is a power of 2
 946 * less than the maximum pages.
 947 *
 948 * Return:
 949 * The new number of sets
 950 */
 951
 952static u32 tid_flush_pages(struct tid_rdma_pageset *list,
 953                           u32 *idx, u32 pages, u32 sets)
 954{
 955        while (pages) {
 956                u32 maxpages = pages;
 957
 958                if (maxpages > MAX_EXPECTED_PAGES)
 959                        maxpages = MAX_EXPECTED_PAGES;
 960                else if (!is_power_of_2(maxpages))
 961                        maxpages = rounddown_pow_of_two(maxpages);
 962                list[sets].idx = *idx;
 963                list[sets++].count = maxpages;
 964                *idx += maxpages;
 965                pages -= maxpages;
 966        }
 967        /* might need a filler */
 968        if (sets & 1)
 969                list[sets++].count = 0;
 970        return sets;
 971}
 972
 973/**
  974 * tid_rdma_find_phys_blocks_8k - get groups based on mr info
 975 * @pages - pointer to an array of page structs
 976 * @npages - number of pages
 977 * @list - page set array to return
 978 *
 979 * This routine parses an array of pages to compute pagesets
 980 * in an 8k compatible way.
 981 *
  982 * Pages are tested two at a time: i and i + 1 for contiguous
  983 * pages, and i - 1 and i for contiguity with the previous pair.
  984 *
  985 * If any condition is false, any accumulated pages are flushed and
 986 * v0,v1 are emitted as separate PAGE_SIZE pagesets
 987 *
 988 * Otherwise, the current 8k is totaled for a future flush.
 989 *
 990 * Return:
 991 * The number of pagesets
 992 * list set with the returned number of pagesets
 993 *
 994 */
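/*
 * Worked example for the routine below (4 KiB pages): six physically
 * contiguous pages produce two pagesets of 4 and 2 pages, since the whole
 * run is accumulated and tid_flush_pages() then splits it into
 * power-of-two sized sets.
 */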
 995static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
 996                                        struct page **pages,
 997                                        u32 npages,
 998                                        struct tid_rdma_pageset *list)
 999{
1000        u32 idx, sets = 0, i;
1001        u32 pagecnt = 0;
1002        void *v0, *v1, *vm1;
1003
1004        if (!npages)
1005                return 0;
1006        for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
1007                /* get a new v0 */
1008                v0 = page_address(pages[i]);
1009                trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
1010                v1 = i + 1 < npages ?
1011                                page_address(pages[i + 1]) : NULL;
1012                trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
1013                /* compare i, i + 1 vaddr */
1014                if (v1 != (v0 + PAGE_SIZE)) {
1015                        /* flush out pages */
1016                        sets = tid_flush_pages(list, &idx, pagecnt, sets);
1017                        /* output v0,v1 as two pagesets */
1018                        list[sets].idx = idx++;
1019                        list[sets++].count = 1;
1020                        if (v1) {
1021                                list[sets].count = 1;
1022                                list[sets++].idx = idx++;
1023                        } else {
1024                                list[sets++].count = 0;
1025                        }
1026                        vm1 = NULL;
1027                        pagecnt = 0;
1028                        continue;
1029                }
1030                /* i,i+1 consecutive, look at i-1,i */
1031                if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
1032                        /* flush out pages */
1033                        sets = tid_flush_pages(list, &idx, pagecnt, sets);
1034                        pagecnt = 0;
1035                }
1036                /* pages will always be a multiple of 8k */
1037                pagecnt += 2;
1038                /* save i-1 */
1039                vm1 = v1;
1040                /* move to next pair */
1041        }
1042        /* dump residual pages at end */
1043        sets = tid_flush_pages(list, &idx, npages - idx, sets);
1044        /* by design cannot be odd sets */
1045        WARN_ON(sets & 1);
1046        return sets;
1047}
1048
1049/**
1050 * Find pages for one segment of a sge array represented by @ss. The function
1051 * does not check the sge, the sge must have been checked for alignment with a
1052 * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
1053 * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
1054 * copy maintained in @ss->sge, the original sge is not modified.
1055 *
1056 * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
1057 * releasing the MR reference count at the same time. Otherwise, we'll "leak"
1058 * references to the MR. This difference requires that we keep track of progress
 1059 * into the sg_list. This is done by the isge cursor in the tid_rdma_request
1060 * structure.
1061 */
1062static u32 kern_find_pages(struct tid_rdma_flow *flow,
1063                           struct page **pages,
1064                           struct rvt_sge_state *ss, bool *last)
1065{
1066        struct tid_rdma_request *req = flow->req;
1067        struct rvt_sge *sge = &ss->sge;
1068        u32 length = flow->req->seg_len;
1069        u32 len = PAGE_SIZE;
1070        u32 i = 0;
1071
1072        while (length && req->isge < ss->num_sge) {
1073                pages[i++] = virt_to_page(sge->vaddr);
1074
1075                sge->vaddr += len;
1076                sge->length -= len;
1077                sge->sge_length -= len;
1078                if (!sge->sge_length) {
1079                        if (++req->isge < ss->num_sge)
1080                                *sge = ss->sg_list[req->isge - 1];
1081                } else if (sge->length == 0 && sge->mr->lkey) {
1082                        if (++sge->n >= RVT_SEGSZ) {
1083                                ++sge->m;
1084                                sge->n = 0;
1085                        }
1086                        sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
1087                        sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
1088                }
1089                length -= len;
1090        }
1091
1092        flow->length = flow->req->seg_len - length;
1093        *last = req->isge == ss->num_sge ? false : true;
1094        return i;
1095}
1096
1097static void dma_unmap_flow(struct tid_rdma_flow *flow)
1098{
1099        struct hfi1_devdata *dd;
1100        int i;
1101        struct tid_rdma_pageset *pset;
1102
1103        dd = flow->req->rcd->dd;
1104        for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1105                        i++, pset++) {
1106                if (pset->count && pset->addr) {
1107                        dma_unmap_page(&dd->pcidev->dev,
1108                                       pset->addr,
1109                                       PAGE_SIZE * pset->count,
1110                                       DMA_FROM_DEVICE);
1111                        pset->mapped = 0;
1112                }
1113        }
1114}
1115
1116static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
1117{
1118        int i;
1119        struct hfi1_devdata *dd = flow->req->rcd->dd;
1120        struct tid_rdma_pageset *pset;
1121
1122        for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
1123                        i++, pset++) {
1124                if (pset->count) {
1125                        pset->addr = dma_map_page(&dd->pcidev->dev,
1126                                                  pages[pset->idx],
1127                                                  0,
1128                                                  PAGE_SIZE * pset->count,
1129                                                  DMA_FROM_DEVICE);
1130
1131                        if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
1132                                dma_unmap_flow(flow);
1133                                return -ENOMEM;
1134                        }
1135                        pset->mapped = 1;
1136                }
1137        }
1138        return 0;
1139}
1140
1141static inline bool dma_mapped(struct tid_rdma_flow *flow)
1142{
1143        return !!flow->pagesets[0].mapped;
1144}
1145
1146/*
 1147 * Get page pointers and identify contiguous physical memory chunks for a
1148 * segment. All segments are of length flow->req->seg_len.
1149 */
1150static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
1151                                struct page **pages,
1152                                struct rvt_sge_state *ss, bool *last)
1153{
1154        u8 npages;
1155
1156        /* Reuse previously computed pagesets, if any */
1157        if (flow->npagesets) {
1158                trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
1159                                          flow);
1160                if (!dma_mapped(flow))
1161                        return dma_map_flow(flow, pages);
1162                return 0;
1163        }
1164
1165        npages = kern_find_pages(flow, pages, ss, last);
1166
1167        if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
1168                flow->npagesets =
1169                        tid_rdma_find_phys_blocks_4k(flow, pages, npages,
1170                                                     flow->pagesets);
1171        else
1172                flow->npagesets =
1173                        tid_rdma_find_phys_blocks_8k(flow, pages, npages,
1174                                                     flow->pagesets);
1175
1176        return dma_map_flow(flow, pages);
1177}
1178
1179static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
1180                                     struct hfi1_ctxtdata *rcd, char *s,
1181                                     struct tid_group *grp, u8 cnt)
1182{
1183        struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
1184
1185        WARN_ON_ONCE(flow->tnode_cnt >=
1186                     (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
1187        if (WARN_ON_ONCE(cnt & 1))
1188                dd_dev_err(rcd->dd,
1189                           "unexpected odd allocation cnt %u map 0x%x used %u",
1190                           cnt, grp->map, grp->used);
1191
1192        node->grp = grp;
1193        node->map = grp->map;
1194        node->cnt = cnt;
1195        trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
1196                                grp->base, grp->map, grp->used, cnt);
1197}
1198
1199/*
1200 * Try to allocate pageset_count TID's from TID groups for a context
1201 *
1202 * This function allocates TID's without moving groups between lists or
 1203 * modifying grp->map. This is done as follows, being cognizant of the lists
1204 * between which the TID groups will move:
1205 * 1. First allocate complete groups of 8 TID's since this is more efficient,
1206 *    these groups will move from group->full without affecting used
1207 * 2. If more TID's are needed allocate from used (will move from used->full or
1208 *    stay in used)
1209 * 3. If we still don't have the required number of TID's go back and look again
1210 *    at a complete group (will move from group->used)
1211 */
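/*
 * Worked example of the strategy above (assuming 8-TID groups): for 19
 * pagesets, step 1 takes two complete groups (16 TIDs) from tid_group_list
 * and step 2 takes the remaining 3 TIDs from partially used groups on
 * tid_used_list; step 3 is only needed if the used list cannot cover the
 * remainder.
 */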
1212static int kern_alloc_tids(struct tid_rdma_flow *flow)
1213{
1214        struct hfi1_ctxtdata *rcd = flow->req->rcd;
1215        struct hfi1_devdata *dd = rcd->dd;
1216        u32 ngroups, pageidx = 0;
1217        struct tid_group *group = NULL, *used;
1218        u8 use;
1219
1220        flow->tnode_cnt = 0;
1221        ngroups = flow->npagesets / dd->rcv_entries.group_size;
1222        if (!ngroups)
1223                goto used_list;
1224
1225        /* First look at complete groups */
1226        list_for_each_entry(group,  &rcd->tid_group_list.list, list) {
1227                kern_add_tid_node(flow, rcd, "complete groups", group,
1228                                  group->size);
1229
1230                pageidx += group->size;
1231                if (!--ngroups)
1232                        break;
1233        }
1234
1235        if (pageidx >= flow->npagesets)
1236                goto ok;
1237
1238used_list:
1239        /* Now look at partially used groups */
1240        list_for_each_entry(used, &rcd->tid_used_list.list, list) {
1241                use = min_t(u32, flow->npagesets - pageidx,
1242                            used->size - used->used);
1243                kern_add_tid_node(flow, rcd, "used groups", used, use);
1244
1245                pageidx += use;
1246                if (pageidx >= flow->npagesets)
1247                        goto ok;
1248        }
1249
1250        /*
 1251         * Look again at a complete group, continuing from where we left off.
1252         * However, if we are at the head, we have reached the end of the
1253         * complete groups list from the first loop above
1254         */
1255        if (group && &group->list == &rcd->tid_group_list.list)
1256                goto bail_eagain;
1257        group = list_prepare_entry(group, &rcd->tid_group_list.list,
1258                                   list);
1259        if (list_is_last(&group->list, &rcd->tid_group_list.list))
1260                goto bail_eagain;
1261        group = list_next_entry(group, list);
1262        use = min_t(u32, flow->npagesets - pageidx, group->size);
1263        kern_add_tid_node(flow, rcd, "complete continue", group, use);
1264        pageidx += use;
1265        if (pageidx >= flow->npagesets)
1266                goto ok;
1267bail_eagain:
1268        trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
1269                                  (u64)flow->npagesets);
1270        return -EAGAIN;
1271ok:
1272        return 0;
1273}
1274
1275static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
1276                                   u32 *pset_idx)
1277{
1278        struct hfi1_ctxtdata *rcd = flow->req->rcd;
1279        struct hfi1_devdata *dd = rcd->dd;
1280        struct kern_tid_node *node = &flow->tnode[grp_num];
1281        struct tid_group *grp = node->grp;
1282        struct tid_rdma_pageset *pset;
1283        u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
1284        u32 rcventry, npages = 0, pair = 0, tidctrl;
1285        u8 i, cnt = 0;
1286
1287        for (i = 0; i < grp->size; i++) {
1288                rcventry = grp->base + i;
1289
1290                if (node->map & BIT(i) || cnt >= node->cnt) {
1291                        rcv_array_wc_fill(dd, rcventry);
1292                        continue;
1293                }
1294                pset = &flow->pagesets[(*pset_idx)++];
1295                if (pset->count) {
1296                        hfi1_put_tid(dd, rcventry, PT_EXPECTED,
1297                                     pset->addr, trdma_pset_order(pset));
1298                } else {
1299                        hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1300                }
1301                npages += pset->count;
1302
1303                rcventry -= rcd->expected_base;
1304                tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
1305                /*
 1306                 * A single TID entry will be used to cover a rcvarray pair (with
 1307                 * tidctrl 0x3) if ALL of these are true: (a) the bit pos is even,
 1308                 * (b) the group map shows the current and next bits as free,
 1309                 * indicating two consecutive rcvarray entries are available, and
 1310                 * (c) we actually need 2 more entries
1311                 */
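                /*
                 * tidctrl values used here: 0x1 selects the even rcvarray
                 * entry of a pair, 0x2 the odd entry, 0x3 consumes both.
                 */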
1312                pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
1313                        node->cnt >= cnt + 2;
1314                if (!pair) {
1315                        if (!pset->count)
1316                                tidctrl = 0x1;
1317                        flow->tid_entry[flow->tidcnt++] =
1318                                EXP_TID_SET(IDX, rcventry >> 1) |
1319                                EXP_TID_SET(CTRL, tidctrl) |
1320                                EXP_TID_SET(LEN, npages);
1321                        trace_hfi1_tid_entry_alloc(/* entry */
1322                           flow->req->qp, flow->tidcnt - 1,
1323                           flow->tid_entry[flow->tidcnt - 1]);
1324
1325                        /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
1326                        flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
1327                        npages = 0;
1328                }
1329
1330                if (grp->used == grp->size - 1)
1331                        tid_group_move(grp, &rcd->tid_used_list,
1332                                       &rcd->tid_full_list);
1333                else if (!grp->used)
1334                        tid_group_move(grp, &rcd->tid_group_list,
1335                                       &rcd->tid_used_list);
1336
1337                grp->used++;
1338                grp->map |= BIT(i);
1339                cnt++;
1340        }
1341}
1342
1343static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
1344{
1345        struct hfi1_ctxtdata *rcd = flow->req->rcd;
1346        struct hfi1_devdata *dd = rcd->dd;
1347        struct kern_tid_node *node = &flow->tnode[grp_num];
1348        struct tid_group *grp = node->grp;
1349        u32 rcventry;
1350        u8 i, cnt = 0;
1351
1352        for (i = 0; i < grp->size; i++) {
1353                rcventry = grp->base + i;
1354
1355                if (node->map & BIT(i) || cnt >= node->cnt) {
1356                        rcv_array_wc_fill(dd, rcventry);
1357                        continue;
1358                }
1359
1360                hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
1361
1362                grp->used--;
1363                grp->map &= ~BIT(i);
1364                cnt++;
1365
1366                if (grp->used == grp->size - 1)
1367                        tid_group_move(grp, &rcd->tid_full_list,
1368                                       &rcd->tid_used_list);
1369                else if (!grp->used)
1370                        tid_group_move(grp, &rcd->tid_used_list,
1371                                       &rcd->tid_group_list);
1372        }
1373        if (WARN_ON_ONCE(cnt & 1)) {
1374                struct hfi1_ctxtdata *rcd = flow->req->rcd;
1375                struct hfi1_devdata *dd = rcd->dd;
1376
1377                dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
1378                           cnt, grp->map, grp->used);
1379        }
1380}
1381
1382static void kern_program_rcvarray(struct tid_rdma_flow *flow)
1383{
1384        u32 pset_idx = 0;
1385        int i;
1386
1387        flow->npkts = 0;
1388        flow->tidcnt = 0;
1389        for (i = 0; i < flow->tnode_cnt; i++)
1390                kern_program_rcv_group(flow, i, &pset_idx);
1391        trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
1392}
1393
1394/**
1395 * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
1396 * TID RDMA request
1397 *
1398 * @req: TID RDMA request for which the segment/flow is being set up
1399 * @ss: sge state, maintains state across successive segments of a sge
1400 * @last: set to true after the last sge segment has been processed
1401 *
1402 * This function
1403 * (1) finds a free flow entry in the flow circular buffer
 1404 * (2) finds pages and contiguous physical chunks constituting one segment
1405 *     of an sge
1406 * (3) allocates TID group entries for those chunks
1407 * (4) programs rcvarray entries in the hardware corresponding to those
1408 *     TID's
1409 * (5) computes a tidarray with formatted TID entries which can be sent
1410 *     to the sender
1411 * (6) reserves and programs HW flows
1412 * (7) queues the QP when TID/flow resources are not
1413 *     available.
1414 *
1415 * @req points to struct tid_rdma_request of which the segments are a part. The
1416 * function uses qp, rcd and seg_len members of @req. In the absence of errors,
1417 * req->flow_idx is the index of the flow which has been prepared in this
1418 * invocation of the function. With flow = &req->flows[req->flow_idx],
1419 * flow->tid_entry contains the TID array which the sender can use for TID RDMA
1420 * sends and flow->npkts contains the number of packets required to send the
1421 * segment.
1422 *
1423 * hfi1_check_sge_align() should be called prior to calling this function; if
1424 * it signals an error, TID RDMA cannot be used for this sge and this function
1425 * should not be called.
1426 *
1427 * For the queuing, the caller must hold the flow->req->qp s_lock from the send
1428 * engine; the function acquires the exp_lock itself.
1429 *
1430 * Return:
1431 * The function returns -EAGAIN if a sufficient number of TID/flow resources to
1432 * map the segment could not be allocated; in this case it should be called again
1433 * with the same arguments to retry the TID allocation. It returns -EINVAL if no
1434 * flow slot is available, -ENOMEM if the pages cannot be mapped, and 0 on success.
1435 */
1436int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
1437                            struct rvt_sge_state *ss, bool *last)
1438        __must_hold(&req->qp->s_lock)
1439{
1440        struct tid_rdma_flow *flow = &req->flows[req->setup_head];
1441        struct hfi1_ctxtdata *rcd = req->rcd;
1442        struct hfi1_qp_priv *qpriv = req->qp->priv;
1443        unsigned long flags;
1444        struct rvt_qp *fqp;
1445        u16 clear_tail = req->clear_tail;
1446
1447        lockdep_assert_held(&req->qp->s_lock);
1448        /*
1449         * We return an error if either (a) we don't have space in the flow
1450         * circular buffer, or (b) we already have max entries in the buffer.
1451         * Max entries depend on the type of request we are processing and the
1452         * negotiated TID RDMA parameters.
1453         */
1454        if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
1455            CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
1456            req->n_flows)
1457                return -EINVAL;
1458
1459        /*
1460         * Get pages and identify contiguous physical memory chunks for the segment.
1461         * If we cannot determine a DMA address mapping, we will treat it just
1462         * as if we had run out of space above.
1463         */
1464        if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
1465                hfi1_wait_kmem(flow->req->qp);
1466                return -ENOMEM;
1467        }
1468
1469        spin_lock_irqsave(&rcd->exp_lock, flags);
1470        if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
1471                goto queue;
1472
1473        /*
1474         * At this point we know the number of pagesets and hence the number of
1475         * TID's to map the segment. Allocate the TID's from the TID groups. If
1476         * we cannot allocate the required number we exit and try again later
1477         */
1478        if (kern_alloc_tids(flow))
1479                goto queue;
1480        /*
1481         * Finally program the TID entries with the pagesets, compute the
1482         * tidarray and enable the HW flow
1483         */
1484        kern_program_rcvarray(flow);
1485
1486        /*
1487         * Setup the flow state with relevant information.
1488         * This information is used for tracking the sequence of data packets
1489         * for the segment.
1490         * The flow is set up here as this is the most accurate time and place
1491         * to do so. Doing so at a later time runs the risk of the flow data in
1492         * qpriv getting out of sync.
1493         */
1494        memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
1495        flow->idx = qpriv->flow_state.index;
1496        flow->flow_state.generation = qpriv->flow_state.generation;
1497        flow->flow_state.spsn = qpriv->flow_state.psn;
1498        flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
1499        flow->flow_state.r_next_psn =
1500                full_flow_psn(flow, flow->flow_state.spsn);
1501        qpriv->flow_state.psn += flow->npkts;
1502
1503        dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
1504        /* get head before dropping lock */
1505        fqp = first_qp(rcd, &rcd->rarr_queue);
1506        spin_unlock_irqrestore(&rcd->exp_lock, flags);
1507        tid_rdma_schedule_tid_wakeup(fqp);
1508
1509        req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1510        return 0;
1511queue:
1512        queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
1513        spin_unlock_irqrestore(&rcd->exp_lock, flags);
1514        return -EAGAIN;
1515}
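
/*
 * Illustrative sketch only, not part of the driver: one way a send-engine
 * caller could drive hfi1_kern_exp_rcv_setup() under qp->s_lock, following
 * the retry contract documented above. The helper name below is
 * hypothetical.
 */
static inline u32 example_setup_one_segment(struct tid_rdma_request *req,
                                            struct rvt_sge_state *ss,
                                            bool *last)
{
        /*
         * A non-zero return means no flow slot, TIDs, or pages were
         * available; the QP may have been queued to wait for resources and
         * the call can be retried later with the same arguments.
         */
        if (hfi1_kern_exp_rcv_setup(req, ss, last))
                return 0;

        /*
         * On success (in the normal, non-resend path), the prepared flow is
         * at req->flow_idx; it holds the formatted TID array and the number
         * of packets needed for this segment.
         */
        return req->flows[req->flow_idx].npkts;
}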
1516
1517static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
1518{
1519        flow->npagesets = 0;
1520}
1521
1522/*
1523 * This function is called after one segment has been successfully sent to
1524 * release the flow and TID HW/SW resources for that segment. The segments for a
1525 * TID RDMA request are setup and cleared in FIFO order which is managed using a
1526 * circular buffer.
1527 */
1528int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
1529        __must_hold(&req->qp->s_lock)
1530{
1531        struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
1532        struct hfi1_ctxtdata *rcd = req->rcd;
1533        unsigned long flags;
1534        int i;
1535        struct rvt_qp *fqp;
1536
1537        lockdep_assert_held(&req->qp->s_lock);
1538        /* Exit if we have nothing in the flow circular buffer */
1539        if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
1540                return -EINVAL;
1541
1542        spin_lock_irqsave(&rcd->exp_lock, flags);
1543
1544        for (i = 0; i < flow->tnode_cnt; i++)
1545                kern_unprogram_rcv_group(flow, i);
1546        /* To prevent double unprogramming */
1547        flow->tnode_cnt = 0;
1548        /* get head before dropping lock */
1549        fqp = first_qp(rcd, &rcd->rarr_queue);
1550        spin_unlock_irqrestore(&rcd->exp_lock, flags);
1551
1552        dma_unmap_flow(flow);
1553
1554        hfi1_tid_rdma_reset_flow(flow);
1555        req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
1556
1557        if (fqp == req->qp) {
1558                __trigger_tid_waiter(fqp);
1559                rvt_put_qp(fqp);
1560        } else {
1561                tid_rdma_schedule_tid_wakeup(fqp);
1562        }
1563
1564        return 0;
1565}
1566
1567/*
1568 * This function is called to release all the tid entries for
1569 * a request.
1570 */
1571void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
1572        __must_hold(&req->qp->s_lock)
1573{
1574        /* Use memory barrier for proper ordering */
1575        while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
1576                if (hfi1_kern_exp_rcv_clear(req))
1577                        break;
1578        }
1579}
1580
1581/**
1582 * hfi1_kern_exp_rcv_free_flows - free previously allocated flow information
1583 * @req: the TID RDMA request to be cleaned
1584 */
1585static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
1586{
1587        kfree(req->flows);
1588        req->flows = NULL;
1589}
1590
1591/**
1592 * __trdma_clean_swqe - clean up for large sized QPs
1593 * @qp: the queue pair
1594 * @wqe: the send wqe
1595 */
1596void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
1597{
1598        struct hfi1_swqe_priv *p = wqe->priv;
1599
1600        hfi1_kern_exp_rcv_free_flows(&p->tid_req);
1601}
1602
1603/*
1604 * This can be called at QP create time or in the data path.
1605 */
1606static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
1607                                         gfp_t gfp)
1608{
1609        struct tid_rdma_flow *flows;
1610        int i;
1611
1612        if (likely(req->flows))
1613                return 0;
1614        flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
1615                             req->rcd->numa_id);
1616        if (!flows)
1617                return -ENOMEM;
1618        /* mini init */
1619        for (i = 0; i < MAX_FLOWS; i++) {
1620                flows[i].req = req;
1621                flows[i].npagesets = 0;
1622                flows[i].pagesets[0].mapped = 0;
1623                flows[i].resync_npkts = 0;
1624        }
1625        req->flows = flows;
1626        return 0;
1627}
1628
1629static void hfi1_init_trdma_req(struct rvt_qp *qp,
1630                                struct tid_rdma_request *req)
1631{
1632        struct hfi1_qp_priv *qpriv = qp->priv;
1633
1634        /*
1635         * Initialize various TID RDMA request variables.
1636         * These variables are "static", which is why they
1637         * can be pre-initialized here before the WRs have
1638         * even been submitted.
1639         * However, non-NULL values for these variables do not
1640         * imply that this WQE has been enabled for TID RDMA.
1641         * Drivers should check the WQE's opcode to determine
1642         * if a request is a TID RDMA one or not.
1643         */
1644        req->qp = qp;
1645        req->rcd = qpriv->rcd;
1646}
1647
1648u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
1649                            void *context, int vl, int mode, u64 data)
1650{
1651        struct hfi1_devdata *dd = context;
1652
1653        return dd->verbs_dev.n_tidwait;
1654}
1655
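/*
 * Find the flow in @req whose IB PSN range [ib_spsn, ib_lpsn] covers @psn,
 * walking the allocated flows from req->clear_tail towards req->setup_head.
 * If a match is found and @fidx is non-NULL, the flow's index is returned
 * through it; NULL is returned when no flow covers @psn.
 */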
1656static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
1657                                          u32 psn, u16 *fidx)
1658{
1659        u16 head, tail;
1660        struct tid_rdma_flow *flow;
1661
1662        head = req->setup_head;
1663        tail = req->clear_tail;
1664        for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
1665             tail = CIRC_NEXT(tail, MAX_FLOWS)) {
1666                flow = &req->flows[tail];
1667                if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
1668                    cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
1669                        if (fidx)
1670                                *fidx = tail;
1671                        return flow;
1672                }
1673        }
1674        return NULL;
1675}
1676
1677/* TID RDMA READ functions */
1678u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
1679                                    struct ib_other_headers *ohdr, u32 *bth1,
1680                                    u32 *bth2, u32 *len)
1681{
1682        struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1683        struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
1684        struct rvt_qp *qp = req->qp;
1685        struct hfi1_qp_priv *qpriv = qp->priv;
1686        struct hfi1_swqe_priv *wpriv = wqe->priv;
1687        struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
1688        struct tid_rdma_params *remote;
1689        u32 req_len = 0;
1690        void *req_addr = NULL;
1691
1692        /* This is the IB psn used to send the request */
1693        *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
1694        trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
1695
1696        /* TID Entries for TID RDMA READ payload */
1697        req_addr = &flow->tid_entry[flow->tid_idx];
1698        req_len = sizeof(*flow->tid_entry) *
1699                        (flow->tidcnt - flow->tid_idx);
1700
1701        memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
1702        wpriv->ss.sge.vaddr = req_addr;
1703        wpriv->ss.sge.sge_length = req_len;
1704        wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
1705        /*
1706         * We can safely zero these out. Since the first SGE covers the
1707         * entire packet, nothing else should even look at the MR.
1708         */
1709        wpriv->ss.sge.mr = NULL;
1710        wpriv->ss.sge.m = 0;
1711        wpriv->ss.sge.n = 0;
1712
1713        wpriv->ss.sg_list = NULL;
1714        wpriv->ss.total_len = wpriv->ss.sge.sge_length;
1715        wpriv->ss.num_sge = 1;
1716
1717        /* Construct the TID RDMA READ REQ packet header */
1718        rcu_read_lock();
1719        remote = rcu_dereference(qpriv->tid_rdma.remote);
1720
1721        KDETH_RESET(rreq->kdeth0, KVER, 0x1);
1722        KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
1723        rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
1724                           req->cur_seg * req->seg_len + flow->sent);
1725        rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
1726        rreq->reth.length = cpu_to_be32(*len);
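        /*
         * The TID flow PSN carries the flow generation in the bits above
         * HFI1_KDETH_BTH_SEQ_SHIFT and the per-generation packet sequence
         * (spsn + pkt, masked by HFI1_KDETH_BTH_SEQ_MASK) in the low bits.
         */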
1727        rreq->tid_flow_psn =
1728                cpu_to_be32((flow->flow_state.generation <<
1729                             HFI1_KDETH_BTH_SEQ_SHIFT) |
1730                            ((flow->flow_state.spsn + flow->pkt) &
1731                             HFI1_KDETH_BTH_SEQ_MASK));
1732        rreq->tid_flow_qp =
1733                cpu_to_be32(qpriv->tid_rdma.local.qp |
1734                            ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
1735                             TID_RDMA_DESTQP_FLOW_SHIFT) |
1736                            qpriv->rcd->ctxt);
1737        rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
1738        *bth1 &= ~RVT_QPN_MASK;
1739        *bth1 |= remote->qp;
1740        *bth2 |= IB_BTH_REQ_ACK;
1741        rcu_read_unlock();
1742
1743        /* We are done with this segment */
1744        flow->sent += *len;
1745        req->cur_seg++;
1746        qp->s_state = TID_OP(READ_REQ);
1747        req->ack_pending++;
1748        req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
1749        qpriv->pending_tid_r_segs++;
1750        qp->s_num_rd_atomic++;
1751
1752        /* Set the TID RDMA READ request payload size */
1753        *len = req_len;
1754
1755        return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
1756}
1757
1758/*
1759 * @len: contains the data length to read upon entry and the read request
1760 *       payload length upon exit.
1761 */
1762u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
1763                                 struct ib_other_headers *ohdr, u32 *bth1,
1764                                 u32 *bth2, u32 *len)
1765        __must_hold(&qp->s_lock)
1766{
1767        struct hfi1_qp_priv *qpriv = qp->priv;
1768        struct tid_rdma_request *req = wqe_to_tid_req(wqe);
1769        struct tid_rdma_flow *flow = NULL;
1770        u32 hdwords = 0;
1771        bool last;
1772        bool retry = true;
1773        u32 npkts = rvt_div_round_up_mtu(qp, *len);
1774
1775        trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
1776                                          wqe->lpsn, req);
1777        /*
1778         * Check sync conditions. Make sure that there are no pending
1779         * segments before freeing the flow.
1780         */
1781sync_check:
1782        if (req->state == TID_REQUEST_SYNC) {
1783                if (qpriv->pending_tid_r_segs)
1784                        goto done;
1785
1786                hfi1_kern_clear_hw_flow(req->rcd, qp);
1787                qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
1788                req->state = TID_REQUEST_ACTIVE;
1789        }
1790
1791        /*
1792         * If the request for this segment is resent, the tid resources should
1793         * have been allocated before. In this case, req->flow_idx should
1794         * fall behind req->setup_head.
1795         */
1796        if (req->flow_idx == req->setup_head) {
1797                retry = false;
1798                if (req->state == TID_REQUEST_RESEND) {
1799                        /*
1800                         * This is the first new segment for a request whose
1801                         * earlier segments have been re-sent. We need to
1802                         * set up the sge pointer correctly.
1803                         */
1804                        restart_sge(&qp->s_sge, wqe, req->s_next_psn,
1805                                    qp->pmtu);
1806                        req->isge = 0;
1807                        req->state = TID_REQUEST_ACTIVE;
1808                }
1809
1810                /*
1811                 * Check sync. The last PSN of each generation is reserved for
1812                 * RESYNC.
1813                 */
1814                if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
1815                        req->state = TID_REQUEST_SYNC;
1816                        goto sync_check;
1817                }
1818
1819                /* Allocate the flow if not yet */
1820                if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
1821                        goto done;
1822
1823                /*
1824                 * The following call will advance req->setup_head after
1825                 * allocating the tid entries.
1826                 */
1827                if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
1828                        req->state = TID_REQUEST_QUEUED;
1829
1830                        /*
1831                         * We don't have resources for this segment. The QP has
1832                         * already been queued.
1833                         */
1834                        goto done;
1835                }
1836        }
1837
1838        /* req->flow_idx should only be one slot behind req->setup_head */
1839        flow = &req->flows[req->flow_idx];
1840        flow->pkt = 0;
1841        flow->tid_idx = 0;
1842        flow->sent = 0;
1843        if (!retry) {
1844                /* Set the first and last IB PSN for the flow in use.*/
1845                flow->flow_state.ib_spsn = req->s_next_psn;
1846                flow->flow_state.ib_lpsn =
1847                        flow->flow_state.ib_spsn + flow->npkts - 1;
1848        }
1849
1850        /* Calculate the next segment start psn.*/
1851        req->s_next_psn += flow->npkts;
1852
1853        /* Build the packet header */
1854        hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
1855done:
1856        return hdwords;
1857}
1858
1859/*
1860 * Validate and accept the TID RDMA READ request parameters.
1861 * Return 0 if the request is accepted successfully;
1862 * Return 1 otherwise.
1863 */
1864static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
1865                                     struct rvt_ack_entry *e,
1866                                     struct hfi1_packet *packet,
1867                                     struct ib_other_headers *ohdr,
1868                                     u32 bth0, u32 psn, u64 vaddr, u32 len)
1869{
1870        struct hfi1_qp_priv *qpriv = qp->priv;
1871        struct tid_rdma_request *req;
1872        struct tid_rdma_flow *flow;
1873        u32 flow_psn, i, tidlen = 0, pktlen, tlen;
1874
1875        req = ack_to_tid_req(e);
1876
1877        /* Validate the payload first */
1878        flow = &req->flows[req->setup_head];
1879
1880        /* payload length = packet length - (header length + ICRC length) */
1881        pktlen = packet->tlen - (packet->hlen + 4);
1882        if (pktlen > sizeof(flow->tid_entry))
1883                return 1;
1884        memcpy(flow->tid_entry, packet->ebuf, pktlen);
1885        flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
1886
1887        /*
1888         * Walk the TID_ENTRY list to make sure we have enough space for a
1889         * complete segment. Also calculate the number of required packets.
1890         */
1891        flow->npkts = rvt_div_round_up_mtu(qp, len);
1892        for (i = 0; i < flow->tidcnt; i++) {
1893                trace_hfi1_tid_entry_rcv_read_req(qp, i,
1894                                                  flow->tid_entry[i]);
1895                tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
1896                if (!tlen)
1897                        return 1;
1898
1899                /*
1900                 * For a tid pair (tidctrl == 3), the buffer size of the pair
1901                 * should be the sum of the buffer size described by each
1902                 * tid entry. However, only the first entry needs to be
1903                 * specified in the request (see WFR HAS Section 8.5.7.1).
1904                 */
1905                tidlen += tlen;
1906        }
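        /*
         * EXP_TID LEN counts pages, so tidlen * PAGE_SIZE is the total
         * receive buffer described by the TID entries; it must cover the
         * whole segment of len bytes.
         */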
1907        if (tidlen * PAGE_SIZE < len)
1908                return 1;
1909
1910        /* Empty the flow array */
1911        req->clear_tail = req->setup_head;
1912        flow->pkt = 0;
1913        flow->tid_idx = 0;
1914        flow->tid_offset = 0;
1915        flow->sent = 0;
1916        flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
1917        flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
1918                    TID_RDMA_DESTQP_FLOW_MASK;
1919        flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
1920        flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
1921        flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
1922        flow->length = len;
1923
1924        flow->flow_state.lpsn = flow->flow_state.spsn +
1925                flow->npkts - 1;
1926        flow->flow_state.ib_spsn = psn;
1927        flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
1928
1929        trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
1930        /* Set the initial flow index to the current flow. */
1931        req->flow_idx = req->setup_head;
1932
1933        /* advance circular buffer head */
1934        req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
1935
1936        /*
1937         * Compute last PSN for request.
1938         */
1939        e->opcode = (bth0 >> 24) & 0xff;
1940        e->psn = psn;
1941        e->lpsn = psn + flow->npkts - 1;
1942        e->sent = 0;
1943
1944        req->n_flows = qpriv->tid_rdma.local.max_read;
1945        req->state = TID_REQUEST_ACTIVE;
1946        req->cur_seg = 0;
1947        req->comp_seg = 0;
1948        req->ack_seg = 0;
1949        req->isge = 0;
1950        req->seg_len = qpriv->tid_rdma.local.max_len;
1951        req->total_len = len;
1952        req->total_segs = 1;
1953        req->r_flow_psn = e->psn;
1954
1955        trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
1956                                        req);
1957        return 0;
1958}
1959
1960static int tid_rdma_rcv_error(struct hfi1_packet *packet,
1961                              struct ib_other_headers *ohdr,
1962                              struct rvt_qp *qp, u32 psn, int diff)
1963{
1964        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
1965        struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
1966        struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
1967        struct hfi1_qp_priv *qpriv = qp->priv;
1968        struct rvt_ack_entry *e;
1969        struct tid_rdma_request *req;
1970        unsigned long flags;
1971        u8 prev;
1972        bool old_req;
1973
1974        trace_hfi1_rsp_tid_rcv_error(qp, psn);
1975        trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
1976        if (diff > 0) {
1977                /* sequence error */
1978                if (!qp->r_nak_state) {
1979                        ibp->rvp.n_rc_seqnak++;
1980                        qp->r_nak_state = IB_NAK_PSN_ERROR;
1981                        qp->r_ack_psn = qp->r_psn;
1982                        rc_defered_ack(rcd, qp);
1983                }
1984                goto done;
1985        }
1986
1987        ibp->rvp.n_rc_dupreq++;
1988
1989        spin_lock_irqsave(&qp->s_lock, flags);
1990        e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
1991        if (!e || (e->opcode != TID_OP(READ_REQ) &&
1992                   e->opcode != TID_OP(WRITE_REQ)))
1993                goto unlock;
1994
1995        req = ack_to_tid_req(e);
1996        req->r_flow_psn = psn;
1997        trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
1998        if (e->opcode == TID_OP(READ_REQ)) {
1999                struct ib_reth *reth;
2000                u32 len;
2001                u32 rkey;
2002                u64 vaddr;
2003                int ok;
2004                u32 bth0;
2005
2006                reth = &ohdr->u.tid_rdma.r_req.reth;
2007                /*
2008                 * The requester always restarts from the start of the original
2009                 * request.
2010                 */
2011                len = be32_to_cpu(reth->length);
2012                if (psn != e->psn || len != req->total_len)
2013                        goto unlock;
2014
2015                release_rdma_sge_mr(e);
2016
2017                rkey = be32_to_cpu(reth->rkey);
2018                vaddr = get_ib_reth_vaddr(reth);
2019
2020                qp->r_len = len;
2021                ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
2022                                 IB_ACCESS_REMOTE_READ);
2023                if (unlikely(!ok))
2024                        goto unlock;
2025
2026                /*
2027                 * If all the response packets for the current request have
2028                 * been sent out and this request is complete (old_request
2029                 * == false), the TID flow may be unusable (the
2030                 * req->clear_tail is advanced). However, when an earlier
2031                 * request is received, this request will not be complete any
2032                 * more (qp->s_tail_ack_queue is moved back, see below).
2033                 * Consequently, we need to update the TID flow info every time
2034                 * a duplicate request is received.
2035                 */
2036                bth0 = be32_to_cpu(ohdr->bth[0]);
2037                if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
2038                                              vaddr, len))
2039                        goto unlock;
2040
2041                /*
2042                 * True if the request is already scheduled (between
2043                 * qp->s_tail_ack_queue and qp->r_head_ack_queue);
2044                 */
2045                if (old_req)
2046                        goto unlock;
2047        } else {
2048                struct flow_state *fstate;
2049                bool schedule = false;
2050                u8 i;
2051
2052                if (req->state == TID_REQUEST_RESEND) {
2053                        req->state = TID_REQUEST_RESEND_ACTIVE;
2054                } else if (req->state == TID_REQUEST_INIT_RESEND) {
2055                        req->state = TID_REQUEST_INIT;
2056                        schedule = true;
2057                }
2058
2059                /*
2060                 * True if the request is already scheduled (between
2061                 * qp->s_tail_ack_queue and qp->r_head_ack_queue).
2062                 * Also, don't change requests that are at the SYNC
2063                 * point and haven't generated any responses yet.
2064                 * There is nothing to retransmit for them yet.
2065                 */
2066                if (old_req || req->state == TID_REQUEST_INIT ||
2067                    (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
2068                        for (i = prev + 1; ; i++) {
2069                                if (i > rvt_size_atomic(&dev->rdi))
2070                                        i = 0;
2071                                if (i == qp->r_head_ack_queue)
2072                                        break;
2073                                e = &qp->s_ack_queue[i];
2074                                req = ack_to_tid_req(e);
2075                                if (e->opcode == TID_OP(WRITE_REQ) &&
2076                                    req->state == TID_REQUEST_INIT)
2077                                        req->state = TID_REQUEST_INIT_RESEND;
2078                        }
2079                        /*
2080                         * If the state of the request has been changed,
2081                         * the first leg needs to get scheduled in order to
2082                         * pick up the change. Otherwise, normal response
2083                         * processing should take care of it.
2084                         */
2085                        if (!schedule)
2086                                goto unlock;
2087                }
2088
2089                /*
2090                 * If there is no more allocated segment, just schedule the qp
2091                 * without changing any state.
2092                 */
2093                if (req->clear_tail == req->setup_head)
2094                        goto schedule;
2095                /*
2096                 * If this request has sent responses for segments which have
2097                 * not received data yet (flow_idx != clear_tail), the flow_idx
2098                 * pointer needs to be adjusted so the same responses can be
2099                 * re-sent.
2100                 */
2101                if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
2102                        fstate = &req->flows[req->clear_tail].flow_state;
2103                        qpriv->pending_tid_w_segs -=
2104                                CIRC_CNT(req->flow_idx, req->clear_tail,
2105                                         MAX_FLOWS);
2106                        req->flow_idx =
2107                                CIRC_ADD(req->clear_tail,
2108                                         delta_psn(psn, fstate->resp_ib_psn),
2109                                         MAX_FLOWS);
2110                        qpriv->pending_tid_w_segs +=
2111                                delta_psn(psn, fstate->resp_ib_psn);
2112                        /*
2113                         * When flow_idx == setup_head, we've gotten a duplicate
2114                         * request for a segment, which has not been allocated
2115                         * yet. In that case, don't adjust this request.
2116                         * However, we still want to go through the loop below
2117                         * to adjust all subsequent requests.
2118                         */
2119                        if (CIRC_CNT(req->setup_head, req->flow_idx,
2120                                     MAX_FLOWS)) {
2121                                req->cur_seg = delta_psn(psn, e->psn);
2122                                req->state = TID_REQUEST_RESEND_ACTIVE;
2123                        }
2124                }
2125
2126                for (i = prev + 1; ; i++) {
2127                        /*
2128                         * Look at everything up to and including
2129                         * s_tail_ack_queue
2130                         */
2131                        if (i > rvt_size_atomic(&dev->rdi))
2132                                i = 0;
2133                        if (i == qp->r_head_ack_queue)
2134                                break;
2135                        e = &qp->s_ack_queue[i];
2136                        req = ack_to_tid_req(e);
2137                        trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
2138                                                   e->lpsn, req);
2139                        if (e->opcode != TID_OP(WRITE_REQ) ||
2140                            req->cur_seg == req->comp_seg ||
2141                            req->state == TID_REQUEST_INIT ||
2142                            req->state == TID_REQUEST_INIT_RESEND) {
2143                                if (req->state == TID_REQUEST_INIT)
2144                                        req->state = TID_REQUEST_INIT_RESEND;
2145                                continue;
2146                        }
2147                        qpriv->pending_tid_w_segs -=
2148                                CIRC_CNT(req->flow_idx,
2149                                         req->clear_tail,
2150                                         MAX_FLOWS);
2151                        req->flow_idx = req->clear_tail;
2152                        req->state = TID_REQUEST_RESEND;
2153                        req->cur_seg = req->comp_seg;
2154                }
2155                qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
2156        }
2157        /* Re-process old requests.*/
2158        if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
2159                qp->s_acked_ack_queue = prev;
2160        qp->s_tail_ack_queue = prev;
2161        /*
2162         * Since the qp->s_tail_ack_queue is modified, the
2163         * qp->s_ack_state must be changed to re-initialize
2164         * qp->s_ack_rdma_sge; otherwise, we will end up in the
2165         * wrong memory region.
2166         */
2167        qp->s_ack_state = OP(ACKNOWLEDGE);
2168schedule:
2169        /*
2170         * It's possible to receive a retry psn that is earlier than an RNRNAK
2171         * psn. In this case, the rnrnak state should be cleared.
2172         */
2173        if (qpriv->rnr_nak_state) {
2174                qp->s_nak_state = 0;
2175                qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
2176                qp->r_psn = e->lpsn + 1;
2177                hfi1_tid_write_alloc_resources(qp, true);
2178        }
2179
2180        qp->r_state = e->opcode;
2181        qp->r_nak_state = 0;
2182        qp->s_flags |= RVT_S_RESP_PENDING;
2183        hfi1_schedule_send(qp);
2184unlock:
2185        spin_unlock_irqrestore(&qp->s_lock, flags);
2186done:
2187        return 1;
2188}
2189
2190void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
2191{
2192        /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side) */
2193
2194        /*
2195         * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
2196         *    (see hfi1_rc_rcv())
2197         * 2. Put TID RDMA READ REQ into the response queue (s_ack_queue)
2198         *     - Setup struct tid_rdma_req with request info
2199         *     - Initialize struct tid_rdma_flow info;
2200         *     - Copy TID entries;
2201         * 3. Set the qp->s_ack_state.
2202         * 4. Set RVT_S_RESP_PENDING in s_flags.
2203         * 5. Kick the send engine (hfi1_schedule_send())
2204         */
2205        struct hfi1_ctxtdata *rcd = packet->rcd;
2206        struct rvt_qp *qp = packet->qp;
2207        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
2208        struct ib_other_headers *ohdr = packet->ohdr;
2209        struct rvt_ack_entry *e;
2210        unsigned long flags;
2211        struct ib_reth *reth;
2212        struct hfi1_qp_priv *qpriv = qp->priv;
2213        u32 bth0, psn, len, rkey;
2214        bool fecn;
2215        u8 next;
2216        u64 vaddr;
2217        int diff;
2218        u8 nack_state = IB_NAK_INVALID_REQUEST;
2219
2220        bth0 = be32_to_cpu(ohdr->bth[0]);
2221        if (hfi1_ruc_check_hdr(ibp, packet))
2222                return;
2223
2224        fecn = process_ecn(qp, packet);
2225        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2226        trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
2227
2228        if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
2229                rvt_comm_est(qp);
2230
2231        if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
2232                goto nack_inv;
2233
2234        reth = &ohdr->u.tid_rdma.r_req.reth;
2235        vaddr = be64_to_cpu(reth->vaddr);
2236        len = be32_to_cpu(reth->length);
2237        /* The length needs to be in multiples of PAGE_SIZE */
2238        if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
2239                goto nack_inv;
2240
2241        diff = delta_psn(psn, qp->r_psn);
2242        if (unlikely(diff)) {
2243                tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
2244                return;
2245        }
2246
2247        /* We've verified the request, insert it into the ack queue. */
2248        next = qp->r_head_ack_queue + 1;
2249        if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
2250                next = 0;
2251        spin_lock_irqsave(&qp->s_lock, flags);
2252        if (unlikely(next == qp->s_tail_ack_queue)) {
2253                if (!qp->s_ack_queue[next].sent) {
2254                        nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
2255                        goto nack_inv_unlock;
2256                }
2257                update_ack_queue(qp, next);
2258        }
2259        e = &qp->s_ack_queue[qp->r_head_ack_queue];
2260        release_rdma_sge_mr(e);
2261
2262        rkey = be32_to_cpu(reth->rkey);
2263        qp->r_len = len;
2264
2265        if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
2266                                  rkey, IB_ACCESS_REMOTE_READ)))
2267                goto nack_acc;
2268
2269        /* Accept the request parameters */
2270        if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
2271                                      len))
2272                goto nack_inv_unlock;
2273
2274        qp->r_state = e->opcode;
2275        qp->r_nak_state = 0;
2276        /*
2277         * We need to increment the MSN here instead of when we
2278         * finish sending the result since a duplicate request would
2279         * increment it more than once.
2280         */
2281        qp->r_msn++;
2282        qp->r_psn += e->lpsn - e->psn + 1;
2283
2284        qp->r_head_ack_queue = next;
2285
2286        /*
2287         * For all requests other than TID WRITE which are added to the ack
2288         * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
2289         * do this because of interlocks between these and TID WRITE
2290         * requests. The same change has also been made in hfi1_rc_rcv().
2291         */
2292        qpriv->r_tid_alloc = qp->r_head_ack_queue;
2293
2294        /* Schedule the send tasklet. */
2295        qp->s_flags |= RVT_S_RESP_PENDING;
2296        if (fecn)
2297                qp->s_flags |= RVT_S_ECN;
2298        hfi1_schedule_send(qp);
2299
2300        spin_unlock_irqrestore(&qp->s_lock, flags);
2301        return;
2302
2303nack_inv_unlock:
2304        spin_unlock_irqrestore(&qp->s_lock, flags);
2305nack_inv:
2306        rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
2307        qp->r_nak_state = nack_state;
2308        qp->r_ack_psn = qp->r_psn;
2309        /* Queue NAK for later */
2310        rc_defered_ack(rcd, qp);
2311        return;
2312nack_acc:
2313        spin_unlock_irqrestore(&qp->s_lock, flags);
2314        rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
2315        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
2316        qp->r_ack_psn = qp->r_psn;
2317}
2318
2319u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
2320                                  struct ib_other_headers *ohdr, u32 *bth0,
2321                                  u32 *bth1, u32 *bth2, u32 *len, bool *last)
2322{
2323        struct hfi1_ack_priv *epriv = e->priv;
2324        struct tid_rdma_request *req = &epriv->tid_req;
2325        struct hfi1_qp_priv *qpriv = qp->priv;
2326        struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
2327        u32 tidentry = flow->tid_entry[flow->tid_idx];
2328        u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
2329        struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
2330        u32 next_offset, om = KDETH_OM_LARGE;
2331        bool last_pkt;
2332        u32 hdwords = 0;
2333        struct tid_rdma_params *remote;
2334
2335        *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
2336        flow->sent += *len;
2337        next_offset = flow->tid_offset + *len;
2338        last_pkt = (flow->sent >= flow->length);
2339
2340        trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
2341        trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
2342
2343        rcu_read_lock();
2344        remote = rcu_dereference(qpriv->tid_rdma.remote);
2345        if (!remote) {
2346                rcu_read_unlock();
2347                goto done;
2348        }
2349        KDETH_RESET(resp->kdeth0, KVER, 0x1);
2350        KDETH_SET(resp->kdeth0, SH, !last_pkt);
2351        KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
2352        KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
2353        KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
2354        KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
2355        KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
2356        KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
2357        resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
2358        rcu_read_unlock();
2359
2360        resp->aeth = rvt_compute_aeth(qp);
2361        resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
2362                                               flow->pkt));
2363
2364        *bth0 = TID_OP(READ_RESP) << 24;
2365        *bth1 = flow->tid_qpn;
2366        *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
2367                          HFI1_KDETH_BTH_SEQ_MASK) |
2368                         (flow->flow_state.generation <<
2369                          HFI1_KDETH_BTH_SEQ_SHIFT));
2370        *last = last_pkt;
2371        if (last_pkt)
2372                /* Advance to next flow */
2373                req->clear_tail = (req->clear_tail + 1) &
2374                                  (MAX_FLOWS - 1);
2375
2376        if (next_offset >= tidlen) {
2377                flow->tid_offset = 0;
2378                flow->tid_idx++;
2379        } else {
2380                flow->tid_offset = next_offset;
2381        }
2382
2383        hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
2384
2385done:
2386        return hdwords;
2387}
2388
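/*
 * Scan the send queue from s_acked up to and including s_cur for the WQE
 * whose PSN range [psn, lpsn] contains @psn. Return its TID RDMA request
 * if the WQE's opcode matches @opcode; otherwise return NULL.
 */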
2389static inline struct tid_rdma_request *
2390find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
2391        __must_hold(&qp->s_lock)
2392{
2393        struct rvt_swqe *wqe;
2394        struct tid_rdma_request *req = NULL;
2395        u32 i, end;
2396
2397        end = qp->s_cur + 1;
2398        if (end == qp->s_size)
2399                end = 0;
2400        for (i = qp->s_acked; i != end;) {
2401                wqe = rvt_get_swqe_ptr(qp, i);
2402                if (cmp_psn(psn, wqe->psn) >= 0 &&
2403                    cmp_psn(psn, wqe->lpsn) <= 0) {
2404                        if (wqe->wr.opcode == opcode)
2405                                req = wqe_to_tid_req(wqe);
2406                        break;
2407                }
2408                if (++i == qp->s_size)
2409                        i = 0;
2410        }
2411
2412        return req;
2413}
2414
2415void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
2416{
2417        /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side) */
2418
2419        /*
2420         * 1. Find matching SWQE
2421         * 2. Check that the entire segment has been read.
2422         * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
2423         * 4. Free the TID flow resources.
2424         * 5. Kick the send engine (hfi1_schedule_send())
2425         */
2426        struct ib_other_headers *ohdr = packet->ohdr;
2427        struct rvt_qp *qp = packet->qp;
2428        struct hfi1_qp_priv *priv = qp->priv;
2429        struct hfi1_ctxtdata *rcd = packet->rcd;
2430        struct tid_rdma_request *req;
2431        struct tid_rdma_flow *flow;
2432        u32 opcode, aeth;
2433        bool fecn;
2434        unsigned long flags;
2435        u32 kpsn, ipsn;
2436
2437        trace_hfi1_sender_rcv_tid_read_resp(qp);
2438        fecn = process_ecn(qp, packet);
2439        kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2440        aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
2441        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2442
2443        spin_lock_irqsave(&qp->s_lock, flags);
2444        ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
2445        req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
2446        if (unlikely(!req))
2447                goto ack_op_err;
2448
2449        flow = &req->flows[req->clear_tail];
2450        /* When header suppression is disabled */
2451        if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
2452                update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
2453
2454                if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
2455                        goto ack_done;
2456                flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
2457                /*
2458                 * Copy the payload to destination buffer if this packet is
2459                 * delivered as an eager packet due to RSM rule and FECN.
2460                 * The RSM rule selects FECN bit in BTH and SH bit in
2461                 * KDETH header and therefore will not match the last
2462                 * packet of each segment that has SH bit cleared.
2463                 */
2464                if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
2465                        struct rvt_sge_state ss;
2466                        u32 len;
2467                        u32 tlen = packet->tlen;
2468                        u16 hdrsize = packet->hlen;
2469                        u8 pad = packet->pad;
2470                        u8 extra_bytes = pad + packet->extra_byte +
2471                                (SIZE_OF_CRC << 2);
2472                        u32 pmtu = qp->pmtu;
2473
2474                        if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
2475                                goto ack_op_err;
2476                        len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
2477                        if (unlikely(len < pmtu))
2478                                goto ack_op_err;
2479                        rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
2480                                     false);
2481                        /* Raise the sw sequence check flag for next packet */
2482                        priv->s_flags |= HFI1_R_TID_SW_PSN;
2483                }
2484
2485                goto ack_done;
2486        }
2487        flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
2488        req->ack_pending--;
2489        priv->pending_tid_r_segs--;
2490        qp->s_num_rd_atomic--;
2491        if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
2492            !qp->s_num_rd_atomic) {
2493                qp->s_flags &= ~(RVT_S_WAIT_FENCE |
2494                                 RVT_S_WAIT_ACK);
2495                hfi1_schedule_send(qp);
2496        }
2497        if (qp->s_flags & RVT_S_WAIT_RDMAR) {
2498                qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
2499                hfi1_schedule_send(qp);
2500        }
2501
2502        trace_hfi1_ack(qp, ipsn);
2503        trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
2504                                         req->e.swqe->psn, req->e.swqe->lpsn,
2505                                         req);
2506        trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
2507
2508        /* Release the tid resources */
2509        hfi1_kern_exp_rcv_clear(req);
2510
2511        if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
2512                goto ack_done;
2513
2514        /* If not done yet, build next read request */
2515        if (++req->comp_seg >= req->total_segs) {
2516                priv->tid_r_comp++;
2517                req->state = TID_REQUEST_COMPLETE;
2518        }
2519
2520        /*
2521         * Clear the hw flow under two conditions:
2522         * 1. This request is a sync point and it is complete;
2523         * 2. Current request is completed and there are no more requests.
2524         */
2525        if ((req->state == TID_REQUEST_SYNC &&
2526             req->comp_seg == req->cur_seg) ||
2527            priv->tid_r_comp == priv->tid_r_reqs) {
2528                hfi1_kern_clear_hw_flow(priv->rcd, qp);
2529                priv->s_flags &= ~HFI1_R_TID_SW_PSN;
2530                if (req->state == TID_REQUEST_SYNC)
2531                        req->state = TID_REQUEST_ACTIVE;
2532        }
2533
2534        hfi1_schedule_send(qp);
2535        goto ack_done;
2536
2537ack_op_err:
2538        /*
2539         * The test indicates that the send engine has finished its cleanup
2540         * after sending the request and it's now safe to put the QP into error
2541         * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
2542         * == qp->s_head), it would be unsafe to complete the wqe pointed by
2543         * qp->s_acked here. Putting the qp into error state will safely flush
2544         * all remaining requests.
2545         */
2546        if (qp->s_last == qp->s_acked)
2547                rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
2548
2549ack_done:
2550        spin_unlock_irqrestore(&qp->s_lock, flags);
2551}
2552
2553void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
2554        __must_hold(&qp->s_lock)
2555{
2556        u32 n = qp->s_acked;
2557        struct rvt_swqe *wqe;
2558        struct tid_rdma_request *req;
2559        struct hfi1_qp_priv *priv = qp->priv;
2560
2561        lockdep_assert_held(&qp->s_lock);
2562        /* Free any TID entries */
2563        while (n != qp->s_tail) {
2564                wqe = rvt_get_swqe_ptr(qp, n);
2565                if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2566                        req = wqe_to_tid_req(wqe);
2567                        hfi1_kern_exp_rcv_clear_all(req);
2568                }
2569
2570                if (++n == qp->s_size)
2571                        n = 0;
2572        }
2573        /* Free flow */
2574        hfi1_kern_clear_hw_flow(priv->rcd, qp);
2575}
2576
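/*
 * Handle a TID error for a KDETH packet on the requester side. Receive
 * types at or above RHF_RCV_TYPE_IB are ignored. An eager packet means the
 * eager buffer ran out of space for a TID RDMA WRITE RESP, so the TID RDMA
 * WRITE request is restarted. The packet itself is always dropped (the
 * function always returns true).
 */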
2577static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type)
2578{
2579        struct rvt_qp *qp = packet->qp;
2580
2581        if (rcv_type >= RHF_RCV_TYPE_IB)
2582                goto done;
2583
2584        spin_lock(&qp->s_lock);
2585
2586        /*
2587         * We've run out of space in the eager buffer.
2588         * Eagerly received KDETH packets which require space in the
2589         * eager buffer (packets that have a payload) are TID RDMA WRITE
2590         * response packets. In this case, we have to re-transmit the
2591         * TID RDMA WRITE request.
2592         */
2593        if (rcv_type == RHF_RCV_TYPE_EAGER) {
2594                hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
2595                hfi1_schedule_send(qp);
2596        }
2597
2598        /* Since no payload is delivered, just drop the packet */
2599        spin_unlock(&qp->s_lock);
2600done:
2601        return true;
2602}
2603
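/*
 * Restart a TID RDMA READ request from its oldest outstanding segment:
 * flag the QP for RDMA READ retry sequencing, rewind the requester to the
 * IB PSN of the flow at req->clear_tail, and queue the QP on the receive
 * context's wait list so it gets rescheduled.
 */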
2604static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
2605                                      struct rvt_qp *qp, struct rvt_swqe *wqe)
2606{
2607        struct tid_rdma_request *req;
2608        struct tid_rdma_flow *flow;
2609
2610        /* Start from the right segment */
2611        qp->r_flags |= RVT_R_RDMAR_SEQ;
2612        req = wqe_to_tid_req(wqe);
2613        flow = &req->flows[req->clear_tail];
2614        hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
2615        if (list_empty(&qp->rspwait)) {
2616                qp->r_flags |= RVT_R_RSP_SEND;
2617                rvt_get_qp(qp);
2618                list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
2619        }
2620}
2621
2622/*
2623 * Handle the KDETH eflags for TID RDMA READ response.
2624 *
2625 * Return false if the last packet for a segment has been received and it is
2626 * time to process the response normally; otherwise, return true.
2627 *
2628 * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
2629 */
2630static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2631                                     struct hfi1_packet *packet, u8 rcv_type,
2632                                     u8 rte, u32 psn, u32 ibpsn)
2633        __must_hold(&packet->qp->r_lock) __must_hold(RCU)
2634{
2635        struct hfi1_pportdata *ppd = rcd->ppd;
2636        struct hfi1_devdata *dd = ppd->dd;
2637        struct hfi1_ibport *ibp;
2638        struct rvt_swqe *wqe;
2639        struct tid_rdma_request *req;
2640        struct tid_rdma_flow *flow;
2641        u32 ack_psn;
2642        struct rvt_qp *qp = packet->qp;
2643        struct hfi1_qp_priv *priv = qp->priv;
2644        bool ret = true;
2645        int diff = 0;
2646        u32 fpsn;
2647
2648        lockdep_assert_held(&qp->r_lock);
2649        spin_lock(&qp->s_lock);
2650        /* If the psn is out of valid range, drop the packet */
2651        if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
2652            cmp_psn(ibpsn, qp->s_psn) > 0)
2653                goto s_unlock;
2654
2655        /*
2656         * Note that NAKs implicitly ACK outstanding SEND and RDMA write
2657         * requests and implicitly NAK RDMA read and atomic requests issued
2658         * before the NAK'ed request.
2659         */
2660        ack_psn = ibpsn - 1;
2661        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
2662        ibp = to_iport(qp->ibqp.device, qp->port_num);
2663
2664        /* Complete WQEs that the PSN finishes. */
2665        while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
2666                /*
2667                 * If this request is a RDMA read or atomic, and the NACK is
2668                 * for a later operation, this NACK NAKs the RDMA read or
2669                 * atomic.
2670                 */
2671                if (wqe->wr.opcode == IB_WR_RDMA_READ ||
2672                    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
2673                    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
2674                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
2675                        /* Retry this request. */
2676                        if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
2677                                qp->r_flags |= RVT_R_RDMAR_SEQ;
2678                                if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
2679                                        restart_tid_rdma_read_req(rcd, qp,
2680                                                                  wqe);
2681                                } else {
2682                                        hfi1_restart_rc(qp, qp->s_last_psn + 1,
2683                                                        0);
2684                                        if (list_empty(&qp->rspwait)) {
2685                                                qp->r_flags |= RVT_R_RSP_SEND;
2686                                                rvt_get_qp(qp);
2687                                                list_add_tail(/* wait */
2688                                                   &qp->rspwait,
2689                                                   &rcd->qp_wait_list);
2690                                        }
2691                                }
2692                        }
2693                        /*
2694                         * No need to process the NAK since we are
2695                         * restarting an earlier request.
2696                         */
2697                        break;
2698                }
2699
2700                wqe = do_rc_completion(qp, wqe, ibp);
2701                if (qp->s_acked == qp->s_tail)
2702                        goto s_unlock;
2703        }
2704
2705        if (qp->s_acked == qp->s_tail)
2706                goto s_unlock;
2707
2708        /* Handle the eflags for the request */
2709        if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
2710                goto s_unlock;
2711
2712        req = wqe_to_tid_req(wqe);
2713        switch (rcv_type) {
2714        case RHF_RCV_TYPE_EXPECTED:
2715                switch (rte) {
2716                case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2717                        /*
2718                         * On the first occurrence of a Flow Sequence error,
2719                         * the flag HFI1_R_TID_SW_PSN is set.
2720                         *
2721                         * After that, the flow is *not* reprogrammed and the
2722                         * protocol falls back to SW PSN checking. This is done
2723                         * to prevent continuous Flow Sequence errors for any
2724                         * packets that could be still in the fabric.
2725                         */
2726                        flow = &req->flows[req->clear_tail];
2727                        if (priv->s_flags & HFI1_R_TID_SW_PSN) {
2728                                diff = cmp_psn(psn,
2729                                               flow->flow_state.r_next_psn);
2730                                if (diff > 0) {
2731                                        if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2732                                                restart_tid_rdma_read_req(rcd,
2733                                                                          qp,
2734                                                                          wqe);
2735
2736                                        /* Drop the packet. */
2737                                        goto s_unlock;
2738                                } else if (diff < 0) {
2739                                        /*
2740                                         * If a response packet for a restarted
2741                                         * request has come back, reset the
2742                                         * restart flag.
2743                                         */
2744                                        if (qp->r_flags & RVT_R_RDMAR_SEQ)
2745                                                qp->r_flags &=
2746                                                        ~RVT_R_RDMAR_SEQ;
2747
2748                                        /* Drop the packet. */
2749                                        goto s_unlock;
2750                                }
2751
2752                                /*
2753                                 * If SW PSN verification is successful and
2754                                 * this is the last packet in the segment, tell
2755                                 * the caller to process it as a normal packet.
2756                                 */
2757                                fpsn = full_flow_psn(flow,
2758                                                     flow->flow_state.lpsn);
2759                                if (cmp_psn(fpsn, psn) == 0) {
2760                                        ret = false;
2761                                        if (qp->r_flags & RVT_R_RDMAR_SEQ)
2762                                                qp->r_flags &=
2763                                                        ~RVT_R_RDMAR_SEQ;
2764                                }
2765                                flow->flow_state.r_next_psn =
2766                                        mask_psn(psn + 1);
2767                        } else {
2768                                u32 last_psn;
2769
2770                                last_psn = read_r_next_psn(dd, rcd->ctxt,
2771                                                           flow->idx);
2772                                flow->flow_state.r_next_psn = last_psn;
2773                                priv->s_flags |= HFI1_R_TID_SW_PSN;
2774                                /*
2775                                 * If no request has been restarted yet,
2776                                 * restart the current one.
2777                                 */
2778                                if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
2779                                        restart_tid_rdma_read_req(rcd, qp,
2780                                                                  wqe);
2781                        }
2782
2783                        break;
2784
2785                case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2786                        /*
2787                         * Since the TID flow is able to ride through
2788                         * generation mismatch, drop this stale packet.
2789                         */
2790                        break;
2791
2792                default:
2793                        break;
2794                }
2795                break;
2796
2797        case RHF_RCV_TYPE_ERROR:
2798                switch (rte) {
2799                case RHF_RTE_ERROR_OP_CODE_ERR:
2800                case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2801                case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2802                case RHF_RTE_ERROR_KHDR_KVER_ERR:
2803                case RHF_RTE_ERROR_CONTEXT_ERR:
2804                case RHF_RTE_ERROR_KHDR_TID_ERR:
2805                default:
2806                        break;
2807                }
2808        default:
2809                break;
2810        }
2811s_unlock:
2812        spin_unlock(&qp->s_lock);
2813        return ret;
2814}
2815
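    /*
     * Handle KDETH eflags for TID RDMA packets. Return convention (as used
     * by the in-function comments below): true means the packet needs no
     * further processing by the caller, false means the caller should go on
     * and process it as a normal packet.
     */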
2816bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
2817                              struct hfi1_pportdata *ppd,
2818                              struct hfi1_packet *packet)
2819{
2820        struct hfi1_ibport *ibp = &ppd->ibport_data;
2821        struct hfi1_devdata *dd = ppd->dd;
2822        struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
2823        u8 rcv_type = rhf_rcv_type(packet->rhf);
2824        u8 rte = rhf_rcv_type_err(packet->rhf);
2825        struct ib_header *hdr = packet->hdr;
2826        struct ib_other_headers *ohdr = NULL;
2827        int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
2828        u16 lid  = be16_to_cpu(hdr->lrh[1]);
2829        u8 opcode;
2830        u32 qp_num, psn, ibpsn;
2831        struct rvt_qp *qp;
2832        struct hfi1_qp_priv *qpriv;
2833        unsigned long flags;
2834        bool ret = true;
2835        struct rvt_ack_entry *e;
2836        struct tid_rdma_request *req;
2837        struct tid_rdma_flow *flow;
2838        int diff = 0;
2839
2840        trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
2841                                           packet->rhf);
2842        if (packet->rhf & RHF_ICRC_ERR)
2843                return ret;
2844
2845        packet->ohdr = &hdr->u.oth;
2846        ohdr = packet->ohdr;
2847        trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
2848
2849        /* Get the destination QP number. */
2850        qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
2851                RVT_QPN_MASK;
2852        if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
2853                goto drop;
2854
2855        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
2856        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
2857
2858        rcu_read_lock();
2859        qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
2860        if (!qp)
2861                goto rcu_unlock;
2862
2863        packet->qp = qp;
2864
2865        /* Check for valid receive state. */
2866        spin_lock_irqsave(&qp->r_lock, flags);
2867        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
2868                ibp->rvp.n_pkt_drops++;
2869                goto r_unlock;
2870        }
2871
2872        if (packet->rhf & RHF_TID_ERR) {
2873                /* For TIDERR and RC QPs, preemptively schedule a NAK */
2874                u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
2875
2876                /* Sanity check packet */
2877                if (tlen < 24)
2878                        goto r_unlock;
2879
2880                /*
2881                 * Check for GRH. We should never get packets with GRH in this
2882                 * path.
2883                 */
2884                if (lnh == HFI1_LRH_GRH)
2885                        goto r_unlock;
2886
2887                if (tid_rdma_tid_err(packet, rcv_type))
2888                        goto r_unlock;
2889        }
2890
2891        /* handle TID RDMA READ */
2892        if (opcode == TID_OP(READ_RESP)) {
2893                ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
2894                ibpsn = mask_psn(ibpsn);
2895                ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
2896                                               ibpsn);
2897                goto r_unlock;
2898        }
2899
2900        /*
2901         * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
2902         * processed. These are completed sequentially so we can be sure that
2903         * the pointer will not change until the entire request has completed.
2904         */
2905        spin_lock(&qp->s_lock);
2906        qpriv = qp->priv;
2907        if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID ||
2908            qpriv->r_tid_tail == qpriv->r_tid_head)
2909                goto unlock;
2910        e = &qp->s_ack_queue[qpriv->r_tid_tail];
2911        if (e->opcode != TID_OP(WRITE_REQ))
2912                goto unlock;
2913        req = ack_to_tid_req(e);
2914        if (req->comp_seg == req->cur_seg)
2915                goto unlock;
2916        flow = &req->flows[req->clear_tail];
2917        trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
2918        trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
2919        trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
2920        trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
2921                                               e->lpsn, req);
2922        trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
2923
2924        switch (rcv_type) {
2925        case RHF_RCV_TYPE_EXPECTED:
2926                switch (rte) {
2927                case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
2928                        if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
2929                                qpriv->s_flags |= HFI1_R_TID_SW_PSN;
2930                                flow->flow_state.r_next_psn =
2931                                        read_r_next_psn(dd, rcd->ctxt,
2932                                                        flow->idx);
2933                                qpriv->r_next_psn_kdeth =
2934                                        flow->flow_state.r_next_psn;
2935                                goto nak_psn;
2936                        } else {
2937                                /*
2938                                 * If the received PSN does not match the next
2939                                 * expected PSN, NAK the packet.
2940                                 * However, only do that if we know that a
2941                                 * NAK has already been sent. Otherwise, this
2942                                 * mismatch could be due to packets that were
2943                                 * already in flight.
2944                                 */
2945                                diff = cmp_psn(psn,
2946                                               flow->flow_state.r_next_psn);
2947                                if (diff > 0)
2948                                        goto nak_psn;
2949                                else if (diff < 0)
2950                                        break;
2951
2952                                qpriv->s_nak_state = 0;
2953                                /*
2954                                 * If SW PSN verification is successful and this
2955                                 * is the last packet in the segment, tell the
2956                                 * caller to process it as a normal packet.
2957                                 */
2958                                if (psn == full_flow_psn(flow,
2959                                                         flow->flow_state.lpsn))
2960                                        ret = false;
2961                                flow->flow_state.r_next_psn =
2962                                        mask_psn(psn + 1);
2963                                qpriv->r_next_psn_kdeth =
2964                                        flow->flow_state.r_next_psn;
2965                        }
2966                        break;
2967
2968                case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
2969                        goto nak_psn;
2970
2971                default:
2972                        break;
2973                }
2974                break;
2975
2976        case RHF_RCV_TYPE_ERROR:
2977                switch (rte) {
2978                case RHF_RTE_ERROR_OP_CODE_ERR:
2979                case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
2980                case RHF_RTE_ERROR_KHDR_HCRC_ERR:
2981                case RHF_RTE_ERROR_KHDR_KVER_ERR:
2982                case RHF_RTE_ERROR_CONTEXT_ERR:
2983                case RHF_RTE_ERROR_KHDR_TID_ERR:
2984                default:
2985                        break;
2986                }
2987        default:
2988                break;
2989        }
2990
2991unlock:
2992        spin_unlock(&qp->s_lock);
2993r_unlock:
2994        spin_unlock_irqrestore(&qp->r_lock, flags);
2995rcu_unlock:
2996        rcu_read_unlock();
2997drop:
2998        return ret;
2999nak_psn:
3000        ibp->rvp.n_rc_seqnak++;
3001        if (!qpriv->s_nak_state) {
3002                qpriv->s_nak_state = IB_NAK_PSN_ERROR;
3003                /* We are NAK'ing the next expected PSN */
3004                qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
3005                qpriv->s_flags |= RVT_S_ACK_PENDING;
3006                if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
3007                        qpriv->r_tid_ack = qpriv->r_tid_tail;
3008                hfi1_schedule_tid_send(qp);
3009        }
3010        goto unlock;
3011}
3012
3013/*
3014 * "Rewind" the TID request information.
3015 * This means that we reset the state back to ACTIVE,
3016 * find the proper flow, set the flow index to that flow,
3017 * and reset the flow information.
3018 */
3019void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3020                               u32 *bth2)
3021{
3022        struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3023        struct tid_rdma_flow *flow;
3024        struct hfi1_qp_priv *qpriv = qp->priv;
3025        int diff, delta_pkts;
3026        u32 tididx = 0, i;
3027        u16 fidx;
3028
3029        if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3030                *bth2 = mask_psn(qp->s_psn);
3031                flow = find_flow_ib(req, *bth2, &fidx);
3032                if (!flow) {
3033                        trace_hfi1_msg_tid_restart_req(/* msg */
3034                           qp, "!!!!!! Could not find flow to restart: bth2 ",
3035                           (u64)*bth2);
3036                        trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
3037                                                       wqe->psn, wqe->lpsn,
3038                                                       req);
3039                        return;
3040                }
3041        } else {
3042                fidx = req->acked_tail;
3043                flow = &req->flows[fidx];
3044                *bth2 = mask_psn(req->r_ack_psn);
3045        }
3046
3047        if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3048                delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
3049        else
3050                delta_pkts = delta_psn(*bth2,
3051                                       full_flow_psn(flow,
3052                                                     flow->flow_state.spsn));
3053
3054        trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3055        diff = delta_pkts + flow->resync_npkts;
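            /*
             * "diff" is the number of packets already covered before the
             * restart point. The loop below walks the TID entries and
             * consumes that many packets' worth of data so that flow->pkt,
             * flow->sent, flow->tid_idx and flow->tid_offset resume at the
             * correct position when the request is resent.
             */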
3056
3057        flow->sent = 0;
3058        flow->pkt = 0;
3059        flow->tid_idx = 0;
3060        flow->tid_offset = 0;
3061        if (diff) {
3062                for (tididx = 0; tididx < flow->tidcnt; tididx++) {
3063                        u32 tidentry = flow->tid_entry[tididx], tidlen,
3064                                tidnpkts, npkts;
3065
3066                        flow->tid_offset = 0;
3067                        tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
3068                        tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
3069                        npkts = min_t(u32, diff, tidnpkts);
3070                        flow->pkt += npkts;
3071                        flow->sent += (npkts == tidnpkts ? tidlen :
3072                                       npkts * qp->pmtu);
3073                        flow->tid_offset += npkts * qp->pmtu;
3074                        diff -= npkts;
3075                        if (!diff)
3076                                break;
3077                }
3078        }
3079        if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3080                rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
3081                             flow->sent, 0);
3082                /*
3083                 * Packet PSN is based on flow_state.spsn + flow->pkt. However,
3084                 * during a RESYNC, the generation is incremented and the
3085                 * sequence is reset to 0. Since we've adjusted the npkts in the
3086                 * flow and the SGE has been sufficiently advanced, we have to
3087                 * adjust flow->pkt in order to calculate the correct PSN.
3088                 */
3089                flow->pkt -= flow->resync_npkts;
3090        }
3091
3092        if (flow->tid_offset ==
3093            EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
3094                tididx++;
3095                flow->tid_offset = 0;
3096        }
3097        flow->tid_idx = tididx;
3098        if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
3099                /* Move flow_idx to correct index */
3100                req->flow_idx = fidx;
3101        else
3102                req->clear_tail = fidx;
3103
3104        trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
3105        trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
3106                                       wqe->lpsn, req);
3107        req->state = TID_REQUEST_ACTIVE;
3108        if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
3109                /* Reset all the flows that we are going to resend */
3110                fidx = CIRC_NEXT(fidx, MAX_FLOWS);
3111                i = qpriv->s_tid_tail;
3112                do {
3113                        for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
3114                              fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
3115                                req->flows[fidx].sent = 0;
3116                                req->flows[fidx].pkt = 0;
3117                                req->flows[fidx].tid_idx = 0;
3118                                req->flows[fidx].tid_offset = 0;
3119                                req->flows[fidx].resync_npkts = 0;
3120                        }
3121                        if (i == qpriv->s_tid_cur)
3122                                break;
3123                        do {
3124                                i = (++i == qp->s_size ? 0 : i);
3125                                wqe = rvt_get_swqe_ptr(qp, i);
3126                        } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
3127                        req = wqe_to_tid_req(wqe);
3128                        req->cur_seg = req->ack_seg;
3129                        fidx = req->acked_tail;
3130                        /* Pull req->clear_tail back */
3131                        req->clear_tail = fidx;
3132                } while (1);
3133        }
3134}
3135
3136void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
3137{
3138        int i, ret;
3139        struct hfi1_qp_priv *qpriv = qp->priv;
3140        struct tid_flow_state *fs;
3141
3142        if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
3143                return;
3144
3145        /*
3146         * First, clear the flow to help prevent any delayed packets from
3147         * being delivered.
3148         */
3149        fs = &qpriv->flow_state;
3150        if (fs->index != RXE_NUM_TID_FLOWS)
3151                hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
3152
3153        for (i = qp->s_acked; i != qp->s_head;) {
3154                struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
3155
3156                if (++i == qp->s_size)
3157                        i = 0;
3158                /* Free only locally allocated TID entries */
3159                if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
3160                        continue;
3161                do {
3162                        struct hfi1_swqe_priv *priv = wqe->priv;
3163
3164                        ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3165                } while (!ret);
3166        }
3167        for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
3168                struct rvt_ack_entry *e = &qp->s_ack_queue[i];
3169
3170                if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
3171                        i = 0;
3172                /* Free only locally allocated TID entries */
3173                if (e->opcode != TID_OP(WRITE_REQ))
3174                        continue;
3175                do {
3176                        struct hfi1_ack_priv *priv = e->priv;
3177
3178                        ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
3179                } while (!ret);
3180        }
3181}
3182
3183bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
3184{
3185        struct rvt_swqe *prev;
3186        struct hfi1_qp_priv *priv = qp->priv;
3187        u32 s_prev;
3188        struct tid_rdma_request *req;
3189
3190        s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
3191        prev = rvt_get_swqe_ptr(qp, s_prev);
3192
3193        switch (wqe->wr.opcode) {
3194        case IB_WR_SEND:
3195        case IB_WR_SEND_WITH_IMM:
3196        case IB_WR_SEND_WITH_INV:
3197        case IB_WR_ATOMIC_CMP_AND_SWP:
3198        case IB_WR_ATOMIC_FETCH_AND_ADD:
3199        case IB_WR_RDMA_WRITE:
3200                switch (prev->wr.opcode) {
3201                case IB_WR_TID_RDMA_WRITE:
3202                        req = wqe_to_tid_req(prev);
3203                        if (req->ack_seg != req->total_segs)
3204                                goto interlock;
3205                default:
3206                        break;
3207                }
3208                break;
3209        case IB_WR_RDMA_READ:
3210                if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
3211                        break;
3212                /* fall through */
3213        case IB_WR_TID_RDMA_READ:
3214                switch (prev->wr.opcode) {
3215                case IB_WR_RDMA_READ:
3216                        if (qp->s_acked != qp->s_cur)
3217                                goto interlock;
3218                        break;
3219                case IB_WR_TID_RDMA_WRITE:
3220                        req = wqe_to_tid_req(prev);
3221                        if (req->ack_seg != req->total_segs)
3222                                goto interlock;
3223                default:
3224                        break;
3225                }
3226        default:
3227                break;
3228        }
3229        return false;
3230
3231interlock:
3232        priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
3233        return true;
3234}
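    /*
     * For illustration: if the previous WQE is a TID RDMA WRITE that has
     * not yet been fully acked (req->ack_seg != req->total_segs), a
     * following SEND, RDMA WRITE, atomic, RDMA READ or TID RDMA READ is
     * held back by setting HFI1_S_TID_WAIT_INTERLCK above and returning
     * true to the caller.
     */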
3235
3236/* Does @sge meet the alignment requirements for tid rdma? */
3237static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
3238                                        struct rvt_sge *sge, int num_sge)
3239{
3240        int i;
3241
3242        for (i = 0; i < num_sge; i++, sge++) {
3243                trace_hfi1_sge_check_align(qp, i, sge);
3244                if ((u64)sge->vaddr & ~PAGE_MASK ||
3245                    sge->sge_length & ~PAGE_MASK)
3246                        return false;
3247        }
3248        return true;
3249}
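    /*
     * For example, assuming 4 KiB pages, an SGE whose vaddr is page
     * aligned and whose length is a multiple of 4 KiB passes the check
     * above; otherwise setup_tid_rdma_wqe() below leaves the request as
     * an ordinary RDMA READ.
     */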
3250
3251void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
3252{
3253        struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
3254        struct hfi1_swqe_priv *priv = wqe->priv;
3255        struct tid_rdma_params *remote;
3256        enum ib_wr_opcode new_opcode;
3257        bool do_tid_rdma = false;
3258        struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
3259
3260        if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
3261                                ppd->lid)
3262                return;
3263        if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
3264                return;
3265
3266        rcu_read_lock();
3267        remote = rcu_dereference(qpriv->tid_rdma.remote);
3268        /*
3269         * If TID RDMA is disabled by the negotiation, don't
3270         * use it.
3271         */
3272        if (!remote)
3273                goto exit;
3274
3275        if (wqe->wr.opcode == IB_WR_RDMA_READ) {
3276                if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
3277                                         wqe->wr.num_sge)) {
3278                        new_opcode = IB_WR_TID_RDMA_READ;
3279                        do_tid_rdma = true;
3280                }
3281        } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
3282                /*
3283                 * TID RDMA is enabled for this RDMA WRITE request iff:
3284                 *   1. The remote address is page-aligned,
3285                 *   2. The length is larger than the minimum segment size,
3286                 *   3. The length is page-multiple.
3287                 */
3288                if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
3289                    !(wqe->length & ~PAGE_MASK)) {
3290                        new_opcode = IB_WR_TID_RDMA_WRITE;
3291                        do_tid_rdma = true;
3292                }
3293        }
3294
3295        if (do_tid_rdma) {
3296                if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
3297                        goto exit;
3298                wqe->wr.opcode = new_opcode;
3299                priv->tid_req.seg_len =
3300                        min_t(u32, remote->max_len, wqe->length);
3301                priv->tid_req.total_segs =
3302                        DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
3303                /* Compute the last PSN of the request */
3304                wqe->lpsn = wqe->psn;
3305                if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
3306                        priv->tid_req.n_flows = remote->max_read;
3307                        qpriv->tid_r_reqs++;
3308                        wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
3309                } else {
3310                        wqe->lpsn += priv->tid_req.total_segs - 1;
3311                        atomic_inc(&qpriv->n_requests);
3312                }
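                /*
                 * PSN accounting differs between the two opcodes: TID RDMA
                 * READ consumes one IB PSN per packet (hence the MTU-based
                 * round-up above), while TID RDMA WRITE consumes one IB PSN
                 * per segment. As a rough example, assuming a 256 KiB
                 * seg_len and a 4 KiB MTU, a 1 MiB WRITE spans 4 PSNs while
                 * a 1 MiB READ spans 256 PSNs.
                 */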
3313
3314                priv->tid_req.cur_seg = 0;
3315                priv->tid_req.comp_seg = 0;
3316                priv->tid_req.ack_seg = 0;
3317                priv->tid_req.state = TID_REQUEST_INACTIVE;
3318                /*
3319                 * Reset acked_tail.
3320                 * TID RDMA READ does not have ACKs so it does not
3321                 * update the pointer. We have to reset it so TID RDMA
3322                 * WRITE does not get confused.
3323                 */
3324                priv->tid_req.acked_tail = priv->tid_req.setup_head;
3325                trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
3326                                                 wqe->psn, wqe->lpsn,
3327                                                 &priv->tid_req);
3328        }
3329exit:
3330        rcu_read_unlock();
3331}
3332
3333/* TID RDMA WRITE functions */
3334
3335u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
3336                                  struct ib_other_headers *ohdr,
3337                                  u32 *bth1, u32 *bth2, u32 *len)
3338{
3339        struct hfi1_qp_priv *qpriv = qp->priv;
3340        struct tid_rdma_request *req = wqe_to_tid_req(wqe);
3341        struct tid_rdma_params *remote;
3342
3343        rcu_read_lock();
3344        remote = rcu_dereference(qpriv->tid_rdma.remote);
3345        /*
3346         * Set the number of flows to be used based on the negotiated
3347         * parameters.
3348         */
3349        req->n_flows = remote->max_write;
3350        req->state = TID_REQUEST_ACTIVE;
3351
3352        KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
3353        KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
3354        ohdr->u.tid_rdma.w_req.reth.vaddr =
3355                cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
3356        ohdr->u.tid_rdma.w_req.reth.rkey =
3357                cpu_to_be32(wqe->rdma_wr.rkey);
3358        ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
3359        ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
3360        *bth1 &= ~RVT_QPN_MASK;
3361        *bth1 |= remote->qp;
3362        qp->s_state = TID_OP(WRITE_REQ);
3363        qp->s_flags |= HFI1_S_WAIT_TID_RESP;
3364        *bth2 |= IB_BTH_REQ_ACK;
3365        *len = 0;
3366
3367        rcu_read_unlock();
3368        return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
3369}
3370
3371void hfi1_compute_tid_rdma_flow_wt(void)
3372{
3373        /*
3374         * Heuristic for computing the RNR timeout when waiting on the flow
3375         * queue. Rather than a computationally expensive exact estimate of when
3376         * a flow will be available, we assume that if a QP is at position N in
3377         * the flow queue it has to wait approximately (N + 1) * (number of
3378         * segments between two sync points), assuming a PMTU of 4K. The rationale
3379         * for this is that flows are released and recycled at each sync point.
3380         */
3381        tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
3382                TID_RDMA_MAX_SEGMENT_SIZE;
3383}
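    /*
     * Rough worked example (the exact values depend on the build-time
     * constants): with an 11-bit KDETH sequence space (2048 PSNs per
     * generation), a 4096-byte MTU and a 256 KiB maximum segment size,
     * tid_rdma_flow_wt becomes 2048 * 4096 / 262144 = 32 segments per
     * position in the flow queue.
     */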
3384
3385static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
3386                             struct tid_queue *queue)
3387{
3388        return qpriv->tid_enqueue - queue->dequeue;
3389}
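    /*
     * position_in_queue() is used below to scale RNR timeouts: the
     * difference between the QP's enqueue ticket and the queue's dequeue
     * counter approximates how many queue entries must be serviced before
     * this QP gets its flow or RcvArray resources.
     */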
3390
3391/*
3392 * @qp: points to rvt_qp context.
3393 * @to_seg: desired RNR timeout in segments.
3394 * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
3395 */
3396static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
3397{
3398        struct hfi1_qp_priv *qpriv = qp->priv;
3399        u64 timeout;
3400        u32 bytes_per_us;
3401        u8 i;
3402
3403        bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
3404        timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
3405        /*
3406         * Find the next value in the RNR table that is at least the required
3407         * timeout. This gives the responder some padding.
3408         */
3409        for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
3410                if (rvt_rnr_tbl_to_usec(i) >= timeout)
3411                        return i;
3412        return 0;
3413}
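    /*
     * The index returned above is OR'd with IB_RNR_NAK by the caller (see
     * send_rnr_nak in hfi1_tid_write_alloc_resources() below), asking the
     * requester to back off for at least to_seg segments' worth of
     * transmit time at the current egress rate before resending.
     */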
3414
3415/**
3416 * Central place for resource allocation at the TID write responder.
3417 * It is called from the write_req and write_data interrupt handlers as
3418 * well as the send thread when a queued QP is scheduled for
3419 * resource allocation.
3420 *
3421 * It iterates over (a) segments of a request and then (b) queued requests
3422 * themselves to allocate resources for up to local->max_write
3423 * segments across multiple requests. Allocation stops when we
3424 * hit a sync point and resumes after the data packets at the
3425 * sync point have been received.
3426 *
3427 * Resource allocation and sending of responses are decoupled. The
3428 * request/segment being allocated and the one being sent are as follows.
3429 * Resources are allocated for:
3430 *     [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
3431 * The send thread sends:
3432 *     [request: qp->s_tail_ack_queue, segment: req->cur_seg]
3433 */
3434static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
3435{
3436        struct tid_rdma_request *req;
3437        struct hfi1_qp_priv *qpriv = qp->priv;
3438        struct hfi1_ctxtdata *rcd = qpriv->rcd;
3439        struct tid_rdma_params *local = &qpriv->tid_rdma.local;
3440        struct rvt_ack_entry *e;
3441        u32 npkts, to_seg;
3442        bool last;
3443        int ret = 0;
3444
3445        lockdep_assert_held(&qp->s_lock);
3446
3447        while (1) {
3448                trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
3449                trace_hfi1_tid_write_rsp_alloc_res(qp);
3450                /*
3451                 * Don't allocate more segments if an RNR NAK has already been
3452                 * scheduled to avoid messing up qp->r_psn: the RNR NAK will
3453                 * be sent only when all allocated segments have been sent.
3454                 * However, if more segments are allocated before that, TID RDMA
3455                 * WRITE RESP packets will be sent out for these new segments
3456                 * before the RNR NAK packet. When the requester receives the
3457                 * RNR NAK packet, it will restart with qp->s_last_psn + 1,
3458                 * which does not match qp->r_psn and will be dropped.
3459                 * Consequently, the requester will exhaust its retries and
3460                 * put the qp into error state.
3461                 */
3462                if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
3463                        break;
3464
3465                /* No requests left to process */
3466                if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
3467                        /* If all data has been received, clear the flow */
3468                        if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
3469                            !qpriv->alloc_w_segs) {
3470                                hfi1_kern_clear_hw_flow(rcd, qp);
3471                                qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3472                        }
3473                        break;
3474                }
3475
3476                e = &qp->s_ack_queue[qpriv->r_tid_alloc];
3477                if (e->opcode != TID_OP(WRITE_REQ))
3478                        goto next_req;
3479                req = ack_to_tid_req(e);
3480                trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
3481                                                   e->lpsn, req);
3482                /* Finished allocating for all segments of this request */
3483                if (req->alloc_seg >= req->total_segs)
3484                        goto next_req;
3485
3486                /* Can allocate only a maximum of local->max_write for a QP */
3487                if (qpriv->alloc_w_segs >= local->max_write)
3488                        break;
3489
3490                /* Don't allocate at a sync point with data packets pending */
3491                if (qpriv->sync_pt && qpriv->alloc_w_segs)
3492                        break;
3493
3494                /* All data received at the sync point, continue */
3495                if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
3496                        hfi1_kern_clear_hw_flow(rcd, qp);
3497                        qpriv->sync_pt = false;
3498                        qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
3499                }
3500
3501                /* Allocate flow if we don't have one */
3502                if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
3503                        ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
3504                        if (ret) {
3505                                to_seg = tid_rdma_flow_wt *
3506                                        position_in_queue(qpriv,
3507                                                          &rcd->flow_queue);
3508                                break;
3509                        }
3510                }
3511
3512                npkts = rvt_div_round_up_mtu(qp, req->seg_len);
3513
3514                /*
3515                 * We are at a sync point if we run out of KDETH PSN space.
3516                 * Last PSN of every generation is reserved for RESYNC.
3517                 */
3518                if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
3519                        qpriv->sync_pt = true;
3520                        break;
3521                }
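                /*
                 * In other words, a segment is never split across a
                 * generation boundary: if its packets would run into the
                 * reserved last PSN of the generation, allocation pauses at
                 * this sync point until the flow is cleared and a new
                 * generation can be used.
                 */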
3522
3523                /*
3524                 * If overtaking req->acked_tail, send an RNR NAK. Because the
3525                 * QP is not queued in this case and the issue can only be
3526                 * caused by a delay in scheduling the second leg, which we
3527                 * cannot estimate, we use a rather arbitrary RNR timeout of
3528                 * (MAX_FLOWS / 2) segments.
3529                 */
3530                if (!CIRC_SPACE(req->setup_head, req->acked_tail,
3531                                MAX_FLOWS)) {
3532                        ret = -EAGAIN;
3533                        to_seg = MAX_FLOWS >> 1;
3534                        qpriv->s_flags |= RVT_S_ACK_PENDING;
3535                        hfi1_schedule_tid_send(qp);
3536                        break;
3537                }
3538
3539                /* Try to allocate rcv array / TID entries */
3540                ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
3541                if (ret == -EAGAIN)
3542                        to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
3543                if (ret)
3544                        break;
3545
3546                qpriv->alloc_w_segs++;
3547                req->alloc_seg++;
3548                continue;
3549next_req:
3550                /* Begin processing the next request */
3551                if (++qpriv->r_tid_alloc >
3552                    rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3553                        qpriv->r_tid_alloc = 0;
3554        }
3555
3556        /*
3557         * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
3558         * has failed, (b) we are called from the rcv handler interrupt context,
3559         * and (c) an RNR NAK has not already been scheduled.
3560         */
3561        if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
3562                goto send_rnr_nak;
3563
3564        return;
3565
3566send_rnr_nak:
3567        lockdep_assert_held(&qp->r_lock);
3568
3569        /* Set r_nak_state to prevent unrelated events from generating NAKs */
3570        qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
3571
3572        /* Pull back r_psn to the segment being RNR NAK'd */
3573        qp->r_psn = e->psn + req->alloc_seg;
3574        qp->r_ack_psn = qp->r_psn;
3575        /*
3576         * Pull back r_head_ack_queue to the ack entry following the request
3577         * being RNR NAK'd. This allows resources to be allocated to the request
3578         * if the queued QP is scheduled.
3579         */
3580        qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
3581        if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3582                qp->r_head_ack_queue = 0;
3583        qpriv->r_tid_head = qp->r_head_ack_queue;
3584        /*
3585         * These send-side fields are used in make_rc_ack(). They are set in
3586         * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
3587         * for consistency.
3588         */
3589        qp->s_nak_state = qp->r_nak_state;
3590        qp->s_ack_psn = qp->r_ack_psn;
3591        /*
3592         * Clear the ACK PENDING flag to prevent unwanted ACK because we
3593         * have modified qp->s_ack_psn here.
3594         */
3595        qp->s_flags &= ~(RVT_S_ACK_PENDING);
3596
3597        trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
3598        /*
3599         * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
3600         * has actually been sent. The qp->s_flags RVT_S_ACK_PENDING bit cannot
3601         * be used for this because qp->s_lock is dropped before calling
3602         * hfi1_send_rc_ack(), leading to inconsistency between the receive
3603         * interrupt handlers and the send thread in make_rc_ack().
3604         */
3605        qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
3606
3607        /*
3608         * Schedule the RNR NAK to be sent. RNR NAKs are scheduled from the
3609         * receive interrupt handlers but will be sent from the send engine
3610         * behind any previous responses that may have been scheduled.
3611         */
3612        rc_defered_ack(rcd, qp);
3613}
3614
3615void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
3616{
3617        /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side) */
3618
3619        /*
3620         * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
3621         *    (see hfi1_rc_rcv())
3622         *     - Don't allow 0-length requests.
3623         * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
3624         *     - Setup struct tid_rdma_req with request info
3625         *     - Prepare struct tid_rdma_flow array?
3626         * 3. Set qp->s_ack_state as per the state diagram in the design doc.
3627         * 4. Set RVT_S_RESP_PENDING in s_flags.
3628         * 5. Kick the send engine (hfi1_schedule_send())
3629         */
3630        struct hfi1_ctxtdata *rcd = packet->rcd;
3631        struct rvt_qp *qp = packet->qp;
3632        struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
3633        struct ib_other_headers *ohdr = packet->ohdr;
3634        struct rvt_ack_entry *e;
3635        unsigned long flags;
3636        struct ib_reth *reth;
3637        struct hfi1_qp_priv *qpriv = qp->priv;
3638        struct tid_rdma_request *req;
3639        u32 bth0, psn, len, rkey, num_segs;
3640        bool fecn;
3641        u8 next;
3642        u64 vaddr;
3643        int diff;
3644
3645        bth0 = be32_to_cpu(ohdr->bth[0]);
3646        if (hfi1_ruc_check_hdr(ibp, packet))
3647                return;
3648
3649        fecn = process_ecn(qp, packet);
3650        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
3651        trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
3652
3653        if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
3654                rvt_comm_est(qp);
3655
3656        if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
3657                goto nack_inv;
3658
3659        reth = &ohdr->u.tid_rdma.w_req.reth;
3660        vaddr = be64_to_cpu(reth->vaddr);
3661        len = be32_to_cpu(reth->length);
3662
3663        num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
3664        diff = delta_psn(psn, qp->r_psn);
3665        if (unlikely(diff)) {
3666                tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
3667                return;
3668        }
3669
3670        /*
3671         * The resent request, which was previously RNR NAK'd, is inserted at
3672         * the location of the original request, which is one entry behind
3673         * r_head_ack_queue.
3674         */
3675        if (qpriv->rnr_nak_state)
3676                qp->r_head_ack_queue = qp->r_head_ack_queue ?
3677                        qp->r_head_ack_queue - 1 :
3678                        rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
3679
3680        /* We've verified the request, insert it into the ack queue. */
3681        next = qp->r_head_ack_queue + 1;
3682        if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
3683                next = 0;
3684        spin_lock_irqsave(&qp->s_lock, flags);
3685        if (unlikely(next == qp->s_acked_ack_queue)) {
3686                if (!qp->s_ack_queue[next].sent)
3687                        goto nack_inv_unlock;
3688                update_ack_queue(qp, next);
3689        }
3690        e = &qp->s_ack_queue[qp->r_head_ack_queue];
3691        req = ack_to_tid_req(e);
3692
3693        /* Bring previously RNR NAK'd request back to life */
3694        if (qpriv->rnr_nak_state) {
3695                qp->r_nak_state = 0;
3696                qp->s_nak_state = 0;
3697                qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
3698                qp->r_psn = e->lpsn + 1;
3699                req->state = TID_REQUEST_INIT;
3700                goto update_head;
3701        }
3702
3703        release_rdma_sge_mr(e);
3704
3705        /* The length needs to be in multiples of PAGE_SIZE */
3706        if (!len || len & ~PAGE_MASK)
3707                goto nack_inv_unlock;
3708
3709        rkey = be32_to_cpu(reth->rkey);
3710        qp->r_len = len;
3711
3712        if (e->opcode == TID_OP(WRITE_REQ) &&
3713            (req->setup_head != req->clear_tail ||
3714             req->clear_tail != req->acked_tail))
3715                goto nack_inv_unlock;
3716
3717        if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
3718                                  rkey, IB_ACCESS_REMOTE_WRITE)))
3719                goto nack_acc;
3720
3721        qp->r_psn += num_segs - 1;
3722
3723        e->opcode = (bth0 >> 24) & 0xff;
3724        e->psn = psn;
3725        e->lpsn = qp->r_psn;
3726        e->sent = 0;
3727
3728        req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
3729        req->state = TID_REQUEST_INIT;
3730        req->cur_seg = 0;
3731        req->comp_seg = 0;
3732        req->ack_seg = 0;
3733        req->alloc_seg = 0;
3734        req->isge = 0;
3735        req->seg_len = qpriv->tid_rdma.local.max_len;
3736        req->total_len = len;
3737        req->total_segs = num_segs;
3738        req->r_flow_psn = e->psn;
3739        req->ss.sge = e->rdma_sge;
3740        req->ss.num_sge = 1;
3741
3742        req->flow_idx = req->setup_head;
3743        req->clear_tail = req->setup_head;
3744        req->acked_tail = req->setup_head;
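            /*
             * The request's circular flow indices all start out equal here;
             * they then advance independently as segments are set up
             * (setup_head), responded to (flow_idx), received (clear_tail)
             * and retired (acked_tail).
             */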
3745
3746        qp->r_state = e->opcode;
3747        qp->r_nak_state = 0;
3748        /*
3749         * We need to increment the MSN here instead of when we
3750         * finish sending the result since a duplicate request would
3751         * increment it more than once.
3752         */
3753        qp->r_msn++;
3754        qp->r_psn++;
3755
3756        trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
3757                                         req);
3758
3759        if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
3760                qpriv->r_tid_tail = qp->r_head_ack_queue;
3761        } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
3762                struct tid_rdma_request *ptr;
3763
3764                e = &qp->s_ack_queue[qpriv->r_tid_tail];
3765                ptr = ack_to_tid_req(e);
3766
3767                if (e->opcode != TID_OP(WRITE_REQ) ||
3768                    ptr->comp_seg == ptr->total_segs) {
3769                        if (qpriv->r_tid_tail == qpriv->r_tid_ack)
3770                                qpriv->r_tid_ack = qp->r_head_ack_queue;
3771                        qpriv->r_tid_tail = qp->r_head_ack_queue;
3772                }
3773        }
3774update_head:
3775        qp->r_head_ack_queue = next;
3776        qpriv->r_tid_head = qp->r_head_ack_queue;
3777
3778        hfi1_tid_write_alloc_resources(qp, true);
3779        trace_hfi1_tid_write_rsp_rcv_req(qp);
3780
3781        /* Schedule the send tasklet. */
3782        qp->s_flags |= RVT_S_RESP_PENDING;
3783        if (fecn)
3784                qp->s_flags |= RVT_S_ECN;
3785        hfi1_schedule_send(qp);
3786
3787        spin_unlock_irqrestore(&qp->s_lock, flags);
3788        return;
3789
3790nack_inv_unlock:
3791        spin_unlock_irqrestore(&qp->s_lock, flags);
3792nack_inv:
3793        rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
3794        qp->r_nak_state = IB_NAK_INVALID_REQUEST;
3795        qp->r_ack_psn = qp->r_psn;
3796        /* Queue NAK for later */
3797        rc_defered_ack(rcd, qp);
3798        return;
3799nack_acc:
3800        spin_unlock_irqrestore(&qp->s_lock, flags);
3801        rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
3802        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
3803        qp->r_ack_psn = qp->r_psn;
3804}
3805
3806u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
3807                                   struct ib_other_headers *ohdr, u32 *bth1,
3808                                   u32 bth2, u32 *len,
3809                                   struct rvt_sge_state **ss)
3810{
3811        struct hfi1_ack_priv *epriv = e->priv;
3812        struct tid_rdma_request *req = &epriv->tid_req;
3813        struct hfi1_qp_priv *qpriv = qp->priv;
3814        struct tid_rdma_flow *flow = NULL;
3815        u32 resp_len = 0, hdwords = 0;
3816        void *resp_addr = NULL;
3817        struct tid_rdma_params *remote;
3818
3819        trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
3820                                            req);
3821        trace_hfi1_tid_write_rsp_build_resp(qp);
3822        trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
3823        flow = &req->flows[req->flow_idx];
3824        switch (req->state) {
3825        default:
3826                /*
3827                 * Try to allocate resources here in case the QP was queued and
3828                 * was later scheduled when resources became available.
3829                 */
3830                hfi1_tid_write_alloc_resources(qp, false);
3831
3832                /* We've already sent everything which is ready */
3833                if (req->cur_seg >= req->alloc_seg)
3834                        goto done;
3835
3836                /*
3837                 * Resources can be assigned but responses cannot be sent in
3838                 * rnr_nak state, until the resent request is received.
3839                 */
3840                if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
3841                        goto done;
3842
3843                req->state = TID_REQUEST_ACTIVE;
3844                trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
3845                req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3846                hfi1_add_tid_reap_timer(qp);
3847                break;
3848
3849        case TID_REQUEST_RESEND_ACTIVE:
3850        case TID_REQUEST_RESEND:
3851                trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
3852                req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
3853                if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
3854                        req->state = TID_REQUEST_ACTIVE;
3855
3856                hfi1_mod_tid_reap_timer(qp);
3857                break;
3858        }
3859        flow->flow_state.resp_ib_psn = bth2;
3860        resp_addr = (void *)flow->tid_entry;
3861        resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
3862        req->cur_seg++;
3863
3864        memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
3865        epriv->ss.sge.vaddr = resp_addr;
3866        epriv->ss.sge.sge_length = resp_len;
3867        epriv->ss.sge.length = epriv->ss.sge.sge_length;
3868        /*
3869         * We can safely zero these out. Since the first SGE covers the
3870         * entire packet, nothing else should even look at the MR.
3871         */
3872        epriv->ss.sge.mr = NULL;
3873        epriv->ss.sge.m = 0;
3874        epriv->ss.sge.n = 0;
3875
3876        epriv->ss.sg_list = NULL;
3877        epriv->ss.total_len = epriv->ss.sge.sge_length;
3878        epriv->ss.num_sge = 1;
3879
3880        *ss = &epriv->ss;
3881        *len = epriv->ss.total_len;
3882
3883        /* Construct the TID RDMA WRITE RESP packet header */
3884        rcu_read_lock();
3885        remote = rcu_dereference(qpriv->tid_rdma.remote);
3886
3887        KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
3888        KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
3889        ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
3890        ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
3891                cpu_to_be32((flow->flow_state.generation <<
3892                             HFI1_KDETH_BTH_SEQ_SHIFT) |
3893                            (flow->flow_state.spsn &
3894                             HFI1_KDETH_BTH_SEQ_MASK));
3895        ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
3896                cpu_to_be32(qpriv->tid_rdma.local.qp |
3897                            ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
3898                             TID_RDMA_DESTQP_FLOW_SHIFT) |
3899                            qpriv->rcd->ctxt);
3900        ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
3901        *bth1 = remote->qp;
3902        rcu_read_unlock();
3903        hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
3904        qpriv->pending_tid_w_segs++;
3905done:
3906        return hdwords;
3907}
3908
3909static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
3910{
3911        struct hfi1_qp_priv *qpriv = qp->priv;
3912
3913        lockdep_assert_held(&qp->s_lock);
3914        if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
3915                qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3916                qpriv->s_tid_timer.expires = jiffies +
3917                        qpriv->tid_timer_timeout_jiffies;
3918                add_timer(&qpriv->s_tid_timer);
3919        }
3920}
3921
3922static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
3923{
3924        struct hfi1_qp_priv *qpriv = qp->priv;
3925
3926        lockdep_assert_held(&qp->s_lock);
3927        qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
3928        mod_timer(&qpriv->s_tid_timer, jiffies +
3929                  qpriv->tid_timer_timeout_jiffies);
3930}
3931
3932static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
3933{
3934        struct hfi1_qp_priv *qpriv = qp->priv;
3935        int rval = 0;
3936
3937        lockdep_assert_held(&qp->s_lock);
3938        if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3939                rval = del_timer(&qpriv->s_tid_timer);
3940                qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3941        }
3942        return rval;
3943}
3944
3945void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
3946{
3947        struct hfi1_qp_priv *qpriv = qp->priv;
3948
3949        del_timer_sync(&qpriv->s_tid_timer);
3950        qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
3951}
3952
3953static void hfi1_tid_timeout(struct timer_list *t)
3954{
3955        struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
3956        struct rvt_qp *qp = qpriv->owner;
3957        struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
3958        unsigned long flags;
3959        u32 i;
3960
3961        spin_lock_irqsave(&qp->r_lock, flags);
3962        spin_lock(&qp->s_lock);
3963        if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
3964                dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
3965                            qp->ibqp.qp_num, __func__, __LINE__);
3966                trace_hfi1_msg_tid_timeout(/* msg */
3967                        qp, "resource timeout = ",
3968                        (u64)qpriv->tid_timer_timeout_jiffies);
3969                hfi1_stop_tid_reap_timer(qp);
3970                /*
3971                 * Go through the entire ack queue and clear any outstanding
3972                 * HW flow and RcvArray resources.
3973                 */
3974                hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
3975                for (i = 0; i < rvt_max_atomic(rdi); i++) {
3976                        struct tid_rdma_request *req =
3977                                ack_to_tid_req(&qp->s_ack_queue[i]);
3978
3979                        hfi1_kern_exp_rcv_clear_all(req);
3980                }
3981                spin_unlock(&qp->s_lock);
3982                if (qp->ibqp.event_handler) {
3983                        struct ib_event ev;
3984
3985                        ev.device = qp->ibqp.device;
3986                        ev.element.qp = &qp->ibqp;
3987                        ev.event = IB_EVENT_QP_FATAL;
3988                        qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
3989                }
3990                rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
3991                goto unlock_r_lock;
3992        }
3993        spin_unlock(&qp->s_lock);
3994unlock_r_lock:
3995        spin_unlock_irqrestore(&qp->r_lock, flags);
3996}
3997
3998void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
3999{
4000        /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requester side) */
4001
4002        /*
4003         * 1. Find matching SWQE
4004         * 2. Check that TIDENTRY array has enough space for a complete
4005         *    segment. If not, put QP in error state.
4006         * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
4007         * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
4008         * 5. Set qp->s_state
4009         * 6. Kick the send engine (hfi1_schedule_send())
4010         */
4011        struct ib_other_headers *ohdr = packet->ohdr;
4012        struct rvt_qp *qp = packet->qp;
4013        struct hfi1_qp_priv *qpriv = qp->priv;
4014        struct hfi1_ctxtdata *rcd = packet->rcd;
4015        struct rvt_swqe *wqe;
4016        struct tid_rdma_request *req;
4017        struct tid_rdma_flow *flow;
4018        enum ib_wc_status status;
4019        u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
4020        bool fecn;
4021        unsigned long flags;
4022
4023        fecn = process_ecn(qp, packet);
4024        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4025        aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
4026        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4027
4028        spin_lock_irqsave(&qp->s_lock, flags);
4029
4030        /* Ignore invalid responses */
4031        if (cmp_psn(psn, qp->s_next_psn) >= 0)
4032                goto ack_done;
4033
4034        /* Ignore duplicate responses. */
4035        if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
4036                goto ack_done;
4037
4038        if (unlikely(qp->s_acked == qp->s_tail))
4039                goto ack_done;
4040
4041        /*
4042         * If we are waiting for a particular packet sequence number
4043         * due to a request being resent, check for it. Otherwise,
4044         * ensure that we haven't missed anything.
4045         */
4046        if (qp->r_flags & RVT_R_RDMAR_SEQ) {
4047                if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
4048                        goto ack_done;
4049                qp->r_flags &= ~RVT_R_RDMAR_SEQ;
4050        }
4051
4052        wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
4053        if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
4054                goto ack_op_err;
4055
4056        req = wqe_to_tid_req(wqe);
4057        /*
4058         * If we've lost ACKs and our acked_tail pointer is too far
4059         * behind, don't overwrite segments. Just drop the packet and
4060         * let the reliability protocol take care of it.
4061         */
4062        if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
4063                goto ack_done;
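        /*
         * CIRC_SPACE(head, tail, size) is (tail - head - 1) & (size - 1), so
         * with, say, MAX_FLOWS == 8, setup_head == 5 and acked_tail == 6
         * there is no free slot and the response is dropped here to be
         * recovered by retransmission.
         */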
4064
4065        /*
4066         * The call to do_rc_ack() should be last in the chain of
4067         * packet checks because it will end up updating the QP state.
4068         * Therefore, anything that would prevent the packet from
4069         * being accepted as a successful response should be prior
4070         * to it.
4071         */
4072        if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
4073                goto ack_done;
4074
4075        trace_hfi1_ack(qp, psn);
4076
4077        flow = &req->flows[req->setup_head];
4078        flow->pkt = 0;
4079        flow->tid_idx = 0;
4080        flow->tid_offset = 0;
4081        flow->sent = 0;
4082        flow->resync_npkts = 0;
4083        flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
4084        flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
4085                TID_RDMA_DESTQP_FLOW_MASK;
4086        flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
4087        flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4088        flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
4089        flow->flow_state.resp_ib_psn = psn;
4090        flow->length = min_t(u32, req->seg_len,
4091                             (wqe->length - (req->comp_seg * req->seg_len)));
4092
4093        flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
4094        flow->flow_state.lpsn = flow->flow_state.spsn +
4095                flow->npkts - 1;
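        /*
         * For example, a 64 KiB flow->length with a 4 KiB PMTU gives
         * npkts == 16; with spsn == 0 the flow then spans KDETH sequence
         * numbers 0 through lpsn == 15.
         */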
4096        /* payload length = packet length - (header length + ICRC length) */
4097        pktlen = packet->tlen - (packet->hlen + 4);
4098        if (pktlen > sizeof(flow->tid_entry)) {
4099                status = IB_WC_LOC_LEN_ERR;
4100                goto ack_err;
4101        }
4102        memcpy(flow->tid_entry, packet->ebuf, pktlen);
4103        flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
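        /*
         * For example, a response whose payload is eight TID entries has
         * pktlen == 32 and tidcnt == 8, assuming 32-bit RcvArray entries
         * (sizeof(*flow->tid_entry) == 4).
         */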
4104        trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
4105
4106        req->comp_seg++;
4107        trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
4108        /*
4109         * Walk the TID_ENTRY list to make sure we have enough space for a
4110         * complete segment.
4111         */
4112        for (i = 0; i < flow->tidcnt; i++) {
4113                trace_hfi1_tid_entry_rcv_write_resp(/* entry */
4114                        qp, i, flow->tid_entry[i]);
4115                if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
4116                        status = IB_WC_LOC_LEN_ERR;
4117                        goto ack_err;
4118                }
4119                tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
4120        }
4121        if (tidlen * PAGE_SIZE < flow->length) {
4122                status = IB_WC_LOC_LEN_ERR;
4123                goto ack_err;
4124        }
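        /*
         * The LEN field of each TID entry counts pages, hence the scaling by
         * PAGE_SIZE above; e.g. with 4 KiB pages a 256 KiB flow->length
         * requires the entries to cover at least 64 pages.
         */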
4125
4126        trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
4127                                          wqe->lpsn, req);
4128        /*
4129         * If this is the first response for this request, set the initial
4130         * flow index to the current flow.
4131         */
4132        if (!cmp_psn(psn, wqe->psn)) {
4133                req->r_last_acked = mask_psn(wqe->psn - 1);
4134                /* Set acked flow index to head index */
4135                req->acked_tail = req->setup_head;
4136        }
4137
4138        /* advance circular buffer head */
4139        req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
4140        req->state = TID_REQUEST_ACTIVE;
4141
4142        /*
4143         * If all responses for this TID RDMA WRITE request have been received,
4144         * advance the pointer to the next one.
4145         * Since TID RDMA requests could be mixed in with regular IB requests,
4146         * they might not appear sequentially in the queue. Therefore, the
4147         * next request needs to be "found".
4148         */
4149        if (qpriv->s_tid_cur != qpriv->s_tid_head &&
4150            req->comp_seg == req->total_segs) {
4151                for (i = qpriv->s_tid_cur + 1; ; i++) {
4152                        if (i == qp->s_size)
4153                                i = 0;
4154                        wqe = rvt_get_swqe_ptr(qp, i);
4155                        if (i == qpriv->s_tid_head)
4156                                break;
4157                        if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4158                                break;
4159                }
4160                qpriv->s_tid_cur = i;
4161        }
4162        qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
4163        hfi1_schedule_tid_send(qp);
4164        goto ack_done;
4165
4166ack_op_err:
4167        status = IB_WC_LOC_QP_OP_ERR;
4168ack_err:
4169        rvt_error_qp(qp, status);
4170ack_done:
4171        if (fecn)
4172                qp->s_flags |= RVT_S_ECN;
4173        spin_unlock_irqrestore(&qp->s_lock, flags);
4174}
4175
4176bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
4177                                struct ib_other_headers *ohdr,
4178                                u32 *bth1, u32 *bth2, u32 *len)
4179{
4180        struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4181        struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
4182        struct tid_rdma_params *remote;
4183        struct rvt_qp *qp = req->qp;
4184        struct hfi1_qp_priv *qpriv = qp->priv;
4185        u32 tidentry = flow->tid_entry[flow->tid_idx];
4186        u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
4187        struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
4188        u32 next_offset, om = KDETH_OM_LARGE;
4189        bool last_pkt;
4190
4191        if (!tidlen) {
4192                hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
4193                rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
4194        }
4195
4196        *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
4197        flow->sent += *len;
4198        next_offset = flow->tid_offset + *len;
4199        last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
4200                    next_offset >= tidlen) || (flow->sent >= flow->length);
4201        trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
4202        trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
4203
4204        rcu_read_lock();
4205        remote = rcu_dereference(qpriv->tid_rdma.remote);
4206        KDETH_RESET(wd->kdeth0, KVER, 0x1);
4207        KDETH_SET(wd->kdeth0, SH, !last_pkt);
4208        KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
4209        KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
4210        KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
4211        KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
4212        KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
4213        KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
4214        wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
4215        rcu_read_unlock();
4216
4217        *bth1 = flow->tid_qpn;
4218        *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
4219                         HFI1_KDETH_BTH_SEQ_MASK) |
4220                         (flow->flow_state.generation <<
4221                          HFI1_KDETH_BTH_SEQ_SHIFT));
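        /*
         * The KDETH PSN built above keeps the sequence number in the low
         * HFI1_KDETH_BTH_SEQ_SHIFT bits and the generation above them.
         * For example, assuming an 11-bit sequence field, generation 3 and
         * sequence 5 combine to 0x1805.
         */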
4222        if (last_pkt) {
4223                /* PSNs are zero-based, so +1 to count number of packets */
4224                if (flow->flow_state.lpsn + 1 +
4225                    rvt_div_round_up_mtu(qp, req->seg_len) >
4226                    MAX_TID_FLOW_PSN)
4227                        req->state = TID_REQUEST_SYNC;
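                        /*
                         * For example, assuming MAX_TID_FLOW_PSN == 2048
                         * (11-bit sequence field), a flow ending at
                         * lpsn == 2000 whose next segment needs 64 more
                         * packets would cross the generation boundary, so
                         * the request is moved to TID_REQUEST_SYNC instead
                         * of letting the sequence number wrap.
                         */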
4228                *bth2 |= IB_BTH_REQ_ACK;
4229        }
4230
4231        if (next_offset >= tidlen) {
4232                flow->tid_offset = 0;
4233                flow->tid_idx++;
4234        } else {
4235                flow->tid_offset = next_offset;
4236        }
4237        return last_pkt;
4238}
4239
4240void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
4241{
4242        struct rvt_qp *qp = packet->qp;
4243        struct hfi1_qp_priv *priv = qp->priv;
4244        struct hfi1_ctxtdata *rcd = priv->rcd;
4245        struct ib_other_headers *ohdr = packet->ohdr;
4246        struct rvt_ack_entry *e;
4247        struct tid_rdma_request *req;
4248        struct tid_rdma_flow *flow;
4249        struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4250        unsigned long flags;
4251        u32 psn, next;
4252        u8 opcode;
4253        bool fecn;
4254
4255        fecn = process_ecn(qp, packet);
4256        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4257        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
4258
4259        /*
4260         * All error handling should be done by now. If we are here, the packet
4261         * is either good or has been accepted by the error handler.
4262         */
4263        spin_lock_irqsave(&qp->s_lock, flags);
4264        e = &qp->s_ack_queue[priv->r_tid_tail];
4265        req = ack_to_tid_req(e);
4266        flow = &req->flows[req->clear_tail];
4267        if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
4268                update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
4269
4270                if (cmp_psn(psn, flow->flow_state.r_next_psn))
4271                        goto send_nak;
4272
4273                flow->flow_state.r_next_psn = mask_psn(psn + 1);
4274                /*
4275                 * Copy the payload to the destination buffer if this packet is
4276                 * delivered as an eager packet due to an RSM rule and FECN.
4277                 * The RSM rule selects the FECN bit in the BTH and the SH bit
4278                 * in the KDETH header and therefore will not match the last
4279                 * packet of each segment, which has the SH bit cleared.
4280                 */
4281                if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
4282                        struct rvt_sge_state ss;
4283                        u32 len;
4284                        u32 tlen = packet->tlen;
4285                        u16 hdrsize = packet->hlen;
4286                        u8 pad = packet->pad;
4287                        u8 extra_bytes = pad + packet->extra_byte +
4288                                (SIZE_OF_CRC << 2);
4289                        u32 pmtu = qp->pmtu;
4290
4291                        if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
4292                                goto send_nak;
4293                        len = req->comp_seg * req->seg_len;
4294                        len += delta_psn(psn,
4295                                full_flow_psn(flow, flow->flow_state.spsn)) *
4296                                pmtu;
4297                        if (unlikely(req->total_len - len < pmtu))
4298                                goto send_nak;
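                        /*
                         * len is the byte offset of this packet's payload in
                         * the request: e.g. with one completed 256 KiB
                         * segment, a 4 KiB PMTU and a PSN five packets past
                         * the flow's spsn, len == 262144 + 5 * 4096.
                         */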
4299
4300                        /*
4301                         * The e->rdma_sge field is set when TID RDMA WRITE REQ
4302                         * is first received and is never modified thereafter.
4303                         */
4304                        ss.sge = e->rdma_sge;
4305                        ss.sg_list = NULL;
4306                        ss.num_sge = 1;
4307                        ss.total_len = req->total_len;
4308                        rvt_skip_sge(&ss, len, false);
4309                        rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
4310                                     false);
4311                        /* Raise the sw sequence check flag for next packet */
4312                        priv->r_next_psn_kdeth = mask_psn(psn + 1);
4313                        priv->s_flags |= HFI1_R_TID_SW_PSN;
4314                }
4315                goto exit;
4316        }
4317        flow->flow_state.r_next_psn = mask_psn(psn + 1);
4318        hfi1_kern_exp_rcv_clear(req);
4319        priv->alloc_w_segs--;
4320        rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
4321        req->comp_seg++;
4322        priv->s_nak_state = 0;
4323
4324        /*
4325         * Release the flow if one of the following conditions has been met:
4326         *  - The request has reached a sync point AND all outstanding
4327         *    segments have been completed, or
4328         *  - The entire request is complete and there are no more requests
4329         *    (of any kind) in the queue.
4330         */
4331        trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
4332        trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
4333                                          req);
4334        trace_hfi1_tid_write_rsp_rcv_data(qp);
4335        if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4336                priv->r_tid_ack = priv->r_tid_tail;
4337
4338        if (opcode == TID_OP(WRITE_DATA_LAST)) {
4339                release_rdma_sge_mr(e);
4340                for (next = priv->r_tid_tail + 1; ; next++) {
4341                        if (next > rvt_size_atomic(&dev->rdi))
4342                                next = 0;
4343                        if (next == priv->r_tid_head)
4344                                break;
4345                        e = &qp->s_ack_queue[next];
4346                        if (e->opcode == TID_OP(WRITE_REQ))
4347                                break;
4348                }
4349                priv->r_tid_tail = next;
4350                if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
4351                        qp->s_acked_ack_queue = 0;
4352        }
4353
4354        hfi1_tid_write_alloc_resources(qp, true);
4355
4356        /*
4357         * If we need to generate more responses, schedule the
4358         * send engine.
4359         */
4360        if (req->cur_seg < req->total_segs ||
4361            qp->s_tail_ack_queue != qp->r_head_ack_queue) {
4362                qp->s_flags |= RVT_S_RESP_PENDING;
4363                hfi1_schedule_send(qp);
4364        }
4365
4366        priv->pending_tid_w_segs--;
4367        if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
4368                if (priv->pending_tid_w_segs)
4369                        hfi1_mod_tid_reap_timer(req->qp);
4370                else
4371                        hfi1_stop_tid_reap_timer(req->qp);
4372        }
4373
4374done:
4375        priv->s_flags |= RVT_S_ACK_PENDING;
4376        hfi1_schedule_tid_send(qp);
4377exit:
4378        priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
4379        if (fecn)
4380                qp->s_flags |= RVT_S_ECN;
4381        spin_unlock_irqrestore(&qp->s_lock, flags);
4382        return;
4383
4384send_nak:
4385        if (!priv->s_nak_state) {
4386                priv->s_nak_state = IB_NAK_PSN_ERROR;
4387                priv->s_nak_psn = flow->flow_state.r_next_psn;
4388                priv->s_flags |= RVT_S_ACK_PENDING;
4389                if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
4390                        priv->r_tid_ack = priv->r_tid_tail;
4391                hfi1_schedule_tid_send(qp);
4392        }
4393        goto done;
4394}
4395
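/*
 * A RESYNC PSN has all ones in its KDETH sequence field; e.g., assuming an
 * 11-bit sequence field, any PSN with (psn & 0x7ff) == 0x7ff is treated as a
 * RESYNC PSN regardless of its generation bits.
 */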
4396static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
4397{
4398        return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
4399                      HFI1_KDETH_BTH_SEQ_MASK);
4400}
4401
4402u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
4403                                  struct ib_other_headers *ohdr, u16 iflow,
4404                                  u32 *bth1, u32 *bth2)
4405{
4406        struct hfi1_qp_priv *qpriv = qp->priv;
4407        struct tid_flow_state *fs = &qpriv->flow_state;
4408        struct tid_rdma_request *req = ack_to_tid_req(e);
4409        struct tid_rdma_flow *flow = &req->flows[iflow];
4410        struct tid_rdma_params *remote;
4411
4412        rcu_read_lock();
4413        remote = rcu_dereference(qpriv->tid_rdma.remote);
4414        KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4415        ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4416        *bth1 = remote->qp;
4417        rcu_read_unlock();
4418
4419        if (qpriv->resync) {
4420                *bth2 = mask_psn((fs->generation <<
4421                                  HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
4422                ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4423        } else if (qpriv->s_nak_state) {
4424                *bth2 = mask_psn(qpriv->s_nak_psn);
4425                ohdr->u.tid_rdma.ack.aeth =
4426                        cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
4427                                    (qpriv->s_nak_state <<
4428                                     IB_AETH_CREDIT_SHIFT));
4429        } else {
4430                *bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
4431                ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
4432        }
4433        KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4434        ohdr->u.tid_rdma.ack.tid_flow_qp =
4435                cpu_to_be32(qpriv->tid_rdma.local.qp |
4436                            ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
4437                             TID_RDMA_DESTQP_FLOW_SHIFT) |
4438                            qpriv->rcd->ctxt);
4439
4440        ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
4441        ohdr->u.tid_rdma.ack.verbs_psn =
4442                cpu_to_be32(flow->flow_state.resp_ib_psn);
4443
4444        if (qpriv->resync) {
4445                /*
4446                 * If the PSN before the current expected KDETH PSN is the
4447                 * RESYNC PSN, then we never received a good TID RDMA WRITE
4448                 * DATA packet after a previous RESYNC.
4449                 * In this case, the next expected KDETH PSN stays the same.
4450                 */
4451                if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
4452                        ohdr->u.tid_rdma.ack.tid_flow_psn =
4453                                cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4454                } else {
4455                        /*
4456                         * Because the KDETH PSNs jump during a RESYNC, it's
4457                         * not possible to infer (or compute) the previous value
4458                         * of r_next_psn_kdeth in the case of back-to-back
4459                         * RESYNC packets. Therefore, we save it.
4460                         */
4461                        qpriv->r_next_psn_kdeth_save =
4462                                qpriv->r_next_psn_kdeth - 1;
4463                        ohdr->u.tid_rdma.ack.tid_flow_psn =
4464                                cpu_to_be32(qpriv->r_next_psn_kdeth_save);
4465                        qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
4466                }
4467                qpriv->resync = false;
4468        }
4469
4470        return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
4471}
4472
4473void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
4474{
4475        struct ib_other_headers *ohdr = packet->ohdr;
4476        struct rvt_qp *qp = packet->qp;
4477        struct hfi1_qp_priv *qpriv = qp->priv;
4478        struct rvt_swqe *wqe;
4479        struct tid_rdma_request *req;
4480        struct tid_rdma_flow *flow;
4481        u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn;
4482        unsigned long flags;
4483        u16 fidx;
4484
4485        trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
4486        process_ecn(qp, packet);
4487        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4488        aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
4489        req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
4490        resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
4491
4492        spin_lock_irqsave(&qp->s_lock, flags);
4493        trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
4494
4495        /* If we are waiting for an ACK to RESYNC, drop any other packets */
4496        if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
4497            cmp_psn(psn, qpriv->s_resync_psn))
4498                goto ack_op_err;
4499
4500        ack_psn = req_psn;
4501        if (hfi1_tid_rdma_is_resync_psn(psn))
4502                ack_kpsn = resync_psn;
4503        else
4504                ack_kpsn = psn;
4505        if (aeth >> 29) {
4506                ack_psn--;
4507                ack_kpsn--;
4508        }
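        /*
         * The top three AETH bits distinguish an ACK (0) from a NAK (3), as
         * in the switch below. For a NAK the carried PSN names the first
         * packet that was not received, so backing up by one yields the last
         * PSN that actually arrived.
         */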
4509
4510        if (unlikely(qp->s_acked == qp->s_tail))
4511                goto ack_op_err;
4512
4513        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4514
4515        if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4516                goto ack_op_err;
4517
4518        req = wqe_to_tid_req(wqe);
4519        trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
4520                                       wqe->lpsn, req);
4521        flow = &req->flows[req->acked_tail];
4522        trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
4523
4524        /* Drop stale ACK/NAK */
4525        if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 ||
4526            cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0)
4527                goto ack_op_err;
4528
4529        while (cmp_psn(ack_kpsn,
4530                       full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
4531               req->ack_seg < req->cur_seg) {
4532                req->ack_seg++;
4533                /* advance acked segment pointer */
4534                req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
4535                req->r_last_acked = flow->flow_state.resp_ib_psn;
4536                trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
4537                                               wqe->lpsn, req);
4538                if (req->ack_seg == req->total_segs) {
4539                        req->state = TID_REQUEST_COMPLETE;
4540                        wqe = do_rc_completion(qp, wqe,
4541                                               to_iport(qp->ibqp.device,
4542                                                        qp->port_num));
4543                        trace_hfi1_sender_rcv_tid_ack(qp);
4544                        atomic_dec(&qpriv->n_tid_requests);
4545                        if (qp->s_acked == qp->s_tail)
4546                                break;
4547                        if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
4548                                break;
4549                        req = wqe_to_tid_req(wqe);
4550                }
4551                flow = &req->flows[req->acked_tail];
4552                trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
4553        }
4554
4555        trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
4556                                       wqe->lpsn, req);
4557        switch (aeth >> 29) {
4558        case 0:         /* ACK */
4559                if (qpriv->s_flags & RVT_S_WAIT_ACK)
4560                        qpriv->s_flags &= ~RVT_S_WAIT_ACK;
4561                if (!hfi1_tid_rdma_is_resync_psn(psn)) {
4562                        /* Check if there is any pending TID ACK */
4563                        if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
4564                            req->ack_seg < req->cur_seg)
4565                                hfi1_mod_tid_retry_timer(qp);
4566                        else
4567                                hfi1_stop_tid_retry_timer(qp);
4568                        hfi1_schedule_send(qp);
4569                } else {
4570                        u32 spsn, fpsn, last_acked, generation;
4571                        struct tid_rdma_request *rptr;
4572
4573                        /* ACK(RESYNC) */
4574                        hfi1_stop_tid_retry_timer(qp);
4575                        /* Allow new requests (see hfi1_make_tid_rdma_pkt) */
4576                        qp->s_flags &= ~HFI1_S_WAIT_HALT;
4577                        /*
4578                         * Clear the RVT_S_SEND_ONE flag in case the TID RDMA
4579                         * ACK is received after the TID retry timer has fired
4580                         * again. In this case, do not send any more TID
4581                         * RESYNC requests or wait for any more TID ACK packets.
4582                         */
4583                        qpriv->s_flags &= ~RVT_S_SEND_ONE;
4584                        hfi1_schedule_send(qp);
4585
4586                        if ((qp->s_acked == qpriv->s_tid_tail &&
4587                             req->ack_seg == req->total_segs) ||
4588                            qp->s_acked == qp->s_tail) {
4589                                qpriv->s_state = TID_OP(WRITE_DATA_LAST);
4590                                goto done;
4591                        }
4592
4593                        if (req->ack_seg == req->comp_seg) {
4594                                qpriv->s_state = TID_OP(WRITE_DATA);
4595                                goto done;
4596                        }
4597
4598                        /*
4599                         * The PSN to start with is the next PSN after the
4600                         * RESYNC PSN.
4601                         */
4602                        psn = mask_psn(psn + 1);
4603                        generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
4604                        spsn = 0;
4605
4606                        /*
4607                         * Update to the correct WQE when we get an ACK(RESYNC)
4608                         * in the middle of a request.
4609                         */
4610                        if (delta_psn(ack_psn, wqe->lpsn))
4611                                wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4612                        req = wqe_to_tid_req(wqe);
4613                        flow = &req->flows[req->acked_tail];
4614                        /*
4615                         * RESYNC re-numbers the PSN ranges of all remaining
4616                         * segments. Also, PSNs start from 0 in the middle of a
4617                         * segment, so the first segment's size is less than the
4618                         * default number of packets. flow->resync_npkts is used
4619                         * to track the number of packets from the start of the
4620                         * real segment to the point of 0 PSN after the RESYNC
4621                         * in order to later correctly rewind the SGE.
4622                         */
4623                        fpsn = full_flow_psn(flow, flow->flow_state.spsn);
4624                        req->r_ack_psn = psn;
4625                        flow->resync_npkts +=
4626                                delta_psn(mask_psn(resync_psn + 1), fpsn);
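                        /*
                         * For illustration: if the flow's first full PSN is
                         * 0x1800 and resync_psn is 0x1813, then 20 packets
                         * were already covered before the renumbered range
                         * and resync_npkts grows by 20 so the SGE can be
                         * rewound correctly later.
                         */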
4627                        /*
4628                         * Renumber all packet sequence number ranges
4629                         * based on the new generation.
4630                         */
4631                        last_acked = qp->s_acked;
4632                        rptr = req;
4633                        while (1) {
4634                                /* start from last acked segment */
4635                                for (fidx = rptr->acked_tail;
4636                                     CIRC_CNT(rptr->setup_head, fidx,
4637                                              MAX_FLOWS);
4638                                     fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
4639                                        u32 lpsn;
4640                                        u32 gen;
4641
4642                                        flow = &rptr->flows[fidx];
4643                                        gen = flow->flow_state.generation;
4644                                        if (WARN_ON(gen == generation &&
4645                                                    flow->flow_state.spsn !=
4646                                                     spsn))
4647                                                continue;
4648                                        lpsn = flow->flow_state.lpsn;
4649                                        lpsn = full_flow_psn(flow, lpsn);
4650                                        flow->npkts =
4651                                                delta_psn(lpsn,
4652                                                          mask_psn(resync_psn)
4653                                                          );
4654                                        flow->flow_state.generation =
4655                                                generation;
4656                                        flow->flow_state.spsn = spsn;
4657                                        flow->flow_state.lpsn =
4658                                                flow->flow_state.spsn +
4659                                                flow->npkts - 1;
4660                                        flow->pkt = 0;
4661                                        spsn += flow->npkts;
4662                                        resync_psn += flow->npkts;
4663                                        trace_hfi1_tid_flow_rcv_tid_ack(qp,
4664                                                                        fidx,
4665                                                                        flow);
4666                                }
4667                                if (++last_acked == qpriv->s_tid_cur + 1)
4668                                        break;
4669                                if (last_acked == qp->s_size)
4670                                        last_acked = 0;
4671                                wqe = rvt_get_swqe_ptr(qp, last_acked);
4672                                rptr = wqe_to_tid_req(wqe);
4673                        }
4674                        req->cur_seg = req->ack_seg;
4675                        qpriv->s_tid_tail = qp->s_acked;
4676                        qpriv->s_state = TID_OP(WRITE_REQ);
4677                        hfi1_schedule_tid_send(qp);
4678                }
4679done:
4680                qpriv->s_retry = qp->s_retry_cnt;
4681                break;
4682
4683        case 3:         /* NAK */
4684                hfi1_stop_tid_retry_timer(qp);
4685                switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
4686                        IB_AETH_CREDIT_MASK) {
4687                case 0: /* PSN sequence error */
4688                        if (!req->flows)
4689                                break;
4690                        flow = &req->flows[req->acked_tail];
4691                        flpsn = full_flow_psn(flow, flow->flow_state.lpsn);
4692                        if (cmp_psn(psn, flpsn) > 0)
4693                                break;
4694                        trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
4695                                                        flow);
4696                        req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4697                        req->cur_seg = req->ack_seg;
4698                        qpriv->s_tid_tail = qp->s_acked;
4699                        qpriv->s_state = TID_OP(WRITE_REQ);
4700                        qpriv->s_retry = qp->s_retry_cnt;
4701                        hfi1_schedule_tid_send(qp);
4702                        break;
4703
4704                default:
4705                        break;
4706                }
4707                break;
4708
4709        default:
4710                break;
4711        }
4712
4713ack_op_err:
4714        spin_unlock_irqrestore(&qp->s_lock, flags);
4715}
4716
4717void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
4718{
4719        struct hfi1_qp_priv *priv = qp->priv;
4720        struct ib_qp *ibqp = &qp->ibqp;
4721        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4722
4723        lockdep_assert_held(&qp->s_lock);
4724        if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
4725                priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4726                priv->s_tid_retry_timer.expires = jiffies +
4727                        priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
4728                add_timer(&priv->s_tid_retry_timer);
4729        }
4730}
4731
4732static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
4733{
4734        struct hfi1_qp_priv *priv = qp->priv;
4735        struct ib_qp *ibqp = &qp->ibqp;
4736        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
4737
4738        lockdep_assert_held(&qp->s_lock);
4739        priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
4740        mod_timer(&priv->s_tid_retry_timer, jiffies +
4741                  priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
4742}
4743
4744static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
4745{
4746        struct hfi1_qp_priv *priv = qp->priv;
4747        int rval = 0;
4748
4749        lockdep_assert_held(&qp->s_lock);
4750        if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4751                rval = del_timer(&priv->s_tid_retry_timer);
4752                priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4753        }
4754        return rval;
4755}
4756
4757void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
4758{
4759        struct hfi1_qp_priv *priv = qp->priv;
4760
4761        del_timer_sync(&priv->s_tid_retry_timer);
4762        priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
4763}
4764
4765static void hfi1_tid_retry_timeout(struct timer_list *t)
4766{
4767        struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
4768        struct rvt_qp *qp = priv->owner;
4769        struct rvt_swqe *wqe;
4770        unsigned long flags;
4771        struct tid_rdma_request *req;
4772
4773        spin_lock_irqsave(&qp->r_lock, flags);
4774        spin_lock(&qp->s_lock);
4775        trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
4776        if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
4777                hfi1_stop_tid_retry_timer(qp);
4778                if (!priv->s_retry) {
4779                        trace_hfi1_msg_tid_retry_timeout(/* msg */
4780                                qp,
4781                                "Exhausted retries. Tid retry timeout = ",
4782                                (u64)priv->tid_retry_timeout_jiffies);
4783
4784                        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4785                        hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
4786                        rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
4787                } else {
4788                        wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
4789                        req = wqe_to_tid_req(wqe);
4790                        trace_hfi1_tid_req_tid_retry_timeout(/* req */
4791                           qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
4792
4793                        priv->s_flags &= ~RVT_S_WAIT_ACK;
4794                        /* Only send one packet (the RESYNC) */
4795                        priv->s_flags |= RVT_S_SEND_ONE;
4796                        /*
4797                         * No additional request shall be made by this QP until
4798                         * the RESYNC has been completed.
4799                         */
4800                        qp->s_flags |= HFI1_S_WAIT_HALT;
4801                        priv->s_state = TID_OP(RESYNC);
4802                        priv->s_retry--;
4803                        hfi1_schedule_tid_send(qp);
4804                }
4805        }
4806        spin_unlock(&qp->s_lock);
4807        spin_unlock_irqrestore(&qp->r_lock, flags);
4808}
4809
4810u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
4811                               struct ib_other_headers *ohdr, u32 *bth1,
4812                               u32 *bth2, u16 fidx)
4813{
4814        struct hfi1_qp_priv *qpriv = qp->priv;
4815        struct tid_rdma_params *remote;
4816        struct tid_rdma_request *req = wqe_to_tid_req(wqe);
4817        struct tid_rdma_flow *flow = &req->flows[fidx];
4818        u32 generation;
4819
4820        rcu_read_lock();
4821        remote = rcu_dereference(qpriv->tid_rdma.remote);
4822        KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
4823        ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
4824        *bth1 = remote->qp;
4825        rcu_read_unlock();
4826
4827        generation = kern_flow_generation_next(flow->flow_state.generation);
4828        *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
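        /*
         * For example, assuming an 11-bit sequence field, generation 6
         * yields *bth2 == (6 << 11) - 1 == 0x2fff: an all-ones sequence
         * number carrying the previous generation in its upper bits.
         */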
4829        qpriv->s_resync_psn = *bth2;
4830        *bth2 |= IB_BTH_REQ_ACK;
4831        KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
4832
4833        return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
4834}
4835
4836void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
4837{
4838        struct ib_other_headers *ohdr = packet->ohdr;
4839        struct rvt_qp *qp = packet->qp;
4840        struct hfi1_qp_priv *qpriv = qp->priv;
4841        struct hfi1_ctxtdata *rcd = qpriv->rcd;
4842        struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
4843        struct rvt_ack_entry *e;
4844        struct tid_rdma_request *req;
4845        struct tid_rdma_flow *flow;
4846        struct tid_flow_state *fs = &qpriv->flow_state;
4847        u32 psn, generation, idx, gen_next;
4848        bool fecn;
4849        unsigned long flags;
4850
4851        fecn = process_ecn(qp, packet);
4852        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
4853
4854        generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
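        /*
         * The RESYNC PSN has an all-ones sequence field, so psn + 1 carries
         * into the next generation; e.g., assuming an 11-bit sequence field,
         * a PSN of 0x2fff yields generation 0x3000 >> 11 == 6.
         */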
4855        spin_lock_irqsave(&qp->s_lock, flags);
4856
4857        gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
4858                generation : kern_flow_generation_next(fs->generation);
4859        /*
4860         * The RESYNC packet contains the "next" generation and can only be
4861         * from the current or previous generation.
4862         */
4863        if (generation != mask_generation(gen_next - 1) &&
4864            generation != gen_next)
4865                goto bail;
4866        /* Already processing a resync */
4867        if (qpriv->resync)
4868                goto bail;
4869
4870        spin_lock(&rcd->exp_lock);
4871        if (fs->index >= RXE_NUM_TID_FLOWS) {
4872                /*
4873                 * If we don't have a flow, save the generation so it can be
4874                 * applied when a new flow is allocated
4875                 */
4876                fs->generation = generation;
4877        } else {
4878                /* Reprogram the QP flow with new generation */
4879                rcd->flows[fs->index].generation = generation;
4880                fs->generation = kern_setup_hw_flow(rcd, fs->index);
4881        }
4882        fs->psn = 0;
4883        /*
4884         * Disable SW PSN checking since a RESYNC is equivalent to a
4885         * sync point and the flow has been or will be reprogrammed.
4886         */
4887        qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
4888        trace_hfi1_tid_write_rsp_rcv_resync(qp);
4889
4890        /*
4891         * Reset all TID flow information with the new generation.
4892         * This is done for all requests and segments after the
4893         * last received segment
4894         */
4895        for (idx = qpriv->r_tid_tail; ; idx++) {
4896                u16 flow_idx;
4897
4898                if (idx > rvt_size_atomic(&dev->rdi))
4899                        idx = 0;
4900                e = &qp->s_ack_queue[idx];
4901                if (e->opcode == TID_OP(WRITE_REQ)) {
4902                        req = ack_to_tid_req(e);
4903                        trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
4904                                                      e->lpsn, req);
4905
4906                        /* start from last unacked segment */
4907                        for (flow_idx = req->clear_tail;
4908                             CIRC_CNT(req->setup_head, flow_idx,
4909                                      MAX_FLOWS);
4910                             flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
4911                                u32 lpsn;
4912                                u32 next;
4913
4914                                flow = &req->flows[flow_idx];
4915                                lpsn = full_flow_psn(flow,
4916                                                     flow->flow_state.lpsn);
4917                                next = flow->flow_state.r_next_psn;
4918                                flow->npkts = delta_psn(lpsn, next - 1);
4919                                flow->flow_state.generation = fs->generation;
4920                                flow->flow_state.spsn = fs->psn;
4921                                flow->flow_state.lpsn =
4922                                        flow->flow_state.spsn + flow->npkts - 1;
4923                                flow->flow_state.r_next_psn =
4924                                        full_flow_psn(flow,
4925                                                      flow->flow_state.spsn);
4926                                fs->psn += flow->npkts;
4927                                trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
4928                                                               flow);
4929                        }
4930                }
4931                if (idx == qp->s_tail_ack_queue)
4932                        break;
4933        }
4934
4935        spin_unlock(&rcd->exp_lock);
4936        qpriv->resync = true;
4937        /* RESYNC request always gets a TID RDMA ACK. */
4938        qpriv->s_nak_state = 0;
4939        qpriv->s_flags |= RVT_S_ACK_PENDING;
4940        hfi1_schedule_tid_send(qp);
4941bail:
4942        if (fecn)
4943                qp->s_flags |= RVT_S_ECN;
4944        spin_unlock_irqrestore(&qp->s_lock, flags);
4945}
4946
4947/*
4948 * Call this function when the last TID RDMA WRITE DATA packet for a request
4949 * is built.
4950 */
4951static void update_tid_tail(struct rvt_qp *qp)
4952        __must_hold(&qp->s_lock)
4953{
4954        struct hfi1_qp_priv *priv = qp->priv;
4955        u32 i;
4956        struct rvt_swqe *wqe;
4957
4958        lockdep_assert_held(&qp->s_lock);
4959        /* Can't move beyond s_tid_cur */
4960        if (priv->s_tid_tail == priv->s_tid_cur)
4961                return;
4962        for (i = priv->s_tid_tail + 1; ; i++) {
4963                if (i == qp->s_size)
4964                        i = 0;
4965
4966                if (i == priv->s_tid_cur)
4967                        break;
4968                wqe = rvt_get_swqe_ptr(qp, i);
4969                if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
4970                        break;
4971        }
4972        priv->s_tid_tail = i;
4973        priv->s_state = TID_OP(WRITE_RESP);
4974}
4975
4976int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
4977        __must_hold(&qp->s_lock)
4978{
4979        struct hfi1_qp_priv *priv = qp->priv;
4980        struct rvt_swqe *wqe;
4981        u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
4982        struct ib_other_headers *ohdr;
4983        struct rvt_sge_state *ss = &qp->s_sge;
4984        struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
4985        struct tid_rdma_request *req = ack_to_tid_req(e);
4986        bool last = false;
4987        u8 opcode = TID_OP(WRITE_DATA);
4988
4989        lockdep_assert_held(&qp->s_lock);
4990        trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
4991        /*
4992         * Prioritize the sending of the requests and responses over the
4993         * sending of the TID RDMA data packets.
4994         */
4995        if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
4996             atomic_read(&priv->n_requests) &&
4997             !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
4998                             HFI1_S_ANY_WAIT_IO))) ||
4999            (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
5000             !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
5001                struct iowait_work *iowork;
5002
5003                iowork = iowait_get_ib_work(&priv->s_iowait);
5004                ps->s_txreq = get_waiting_verbs_txreq(iowork);
5005                if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
5006                        priv->s_flags |= HFI1_S_TID_BUSY_SET;
5007                        return 1;
5008                }
5009        }
5010
5011        ps->s_txreq = get_txreq(ps->dev, qp);
5012        if (!ps->s_txreq)
5013                goto bail_no_tx;
5014
5015        ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
5016
5017        if ((priv->s_flags & RVT_S_ACK_PENDING) &&
5018            make_tid_rdma_ack(qp, ohdr, ps))
5019                return 1;
5020
5021        /*
5022         * Bail out if we can't send data.
5023         * Be reminded that this check must be done after the call to
5024         * make_tid_rdma_ack() because the responding QP could be in
5025         * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA.
5026         */
5027        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
5028                goto bail;
5029
5030        if (priv->s_flags & RVT_S_WAIT_ACK)
5031                goto bail;
5032
5033        /* Check whether there is anything to do. */
5034        if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
5035                goto bail;
5036        wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
5037        req = wqe_to_tid_req(wqe);
5038        trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
5039                                        wqe->lpsn, req);
5040        switch (priv->s_state) {
5041        case TID_OP(WRITE_REQ):
5042        case TID_OP(WRITE_RESP):
5043                priv->tid_ss.sge = wqe->sg_list[0];
5044                priv->tid_ss.sg_list = wqe->sg_list + 1;
5045                priv->tid_ss.num_sge = wqe->wr.num_sge;
5046                priv->tid_ss.total_len = wqe->length;
5047
5048                if (priv->s_state == TID_OP(WRITE_REQ))
5049                        hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
5050                priv->s_state = TID_OP(WRITE_DATA);
5051                /* fall through */
5052
5053        case TID_OP(WRITE_DATA):
5054                /*
5055                 * 1. Check whether a TID RDMA WRITE RESP is available.
5056                 * 2. If not:
5057                 *    2.1 If there are more segments and no TID RDMA WRITE RESP,
5058                 *        set HFI1_S_WAIT_TID_RESP
5059                 *    2.2 Return indicating no progress made.
5060                 * 3. If yes:
5061                 *    3.1 Build TID RDMA WRITE DATA packet.
5062                 *    3.2 If last packet in segment:
5063                 *        3.2.1 Change KDETH header bits
5064                 *        3.2.2 Advance RESP pointers.
5065                 *    3.3 Return indicating progress made.
5066                 */
5067                trace_hfi1_sender_make_tid_pkt(qp);
5068                trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
5069                wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
5070                req = wqe_to_tid_req(wqe);
5071                len = wqe->length;
5072
5073                if (!req->comp_seg || req->cur_seg == req->comp_seg)
5074                        goto bail;
5075
5076                trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
5077                                                wqe->psn, wqe->lpsn, req);
5078                last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
5079                                                  &len);
5080
5081                if (last) {
5082                        /* move pointer to next flow */
5083                        req->clear_tail = CIRC_NEXT(req->clear_tail,
5084                                                    MAX_FLOWS);
5085                        if (++req->cur_seg < req->total_segs) {
5086                                if (!CIRC_CNT(req->setup_head, req->clear_tail,
5087                                              MAX_FLOWS))
5088                                        qp->s_flags |= HFI1_S_WAIT_TID_RESP;
5089                        } else {
5090                                priv->s_state = TID_OP(WRITE_DATA_LAST);
5091                                opcode = TID_OP(WRITE_DATA_LAST);
5092
5093                                /* Advance the s_tid_tail now */
5094                                update_tid_tail(qp);
5095                        }
5096                }
5097                hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
5098                ss = &priv->tid_ss;
5099                break;
5100
5101        case TID_OP(RESYNC):
5102                trace_hfi1_sender_make_tid_pkt(qp);
5103                /* Use generation from the most recently received response */
5104                wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
5105                req = wqe_to_tid_req(wqe);
5106                /* If there are no responses for this WQE, look at the previous one */
5107                if (!req->comp_seg) {
5108                        wqe = rvt_get_swqe_ptr(qp,
5109                                               (!priv->s_tid_cur ? qp->s_size :
5110                                                priv->s_tid_cur) - 1);
5111                        req = wqe_to_tid_req(wqe);
5112                }
5113                hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
5114                                                     &bth2,
5115                                                     CIRC_PREV(req->setup_head,
5116                                                               MAX_FLOWS));
5117                ss = NULL;
5118                len = 0;
5119                opcode = TID_OP(RESYNC);
5120                break;
5121
5122        default:
5123                goto bail;
5124        }
5125        if (priv->s_flags & RVT_S_SEND_ONE) {
5126                priv->s_flags &= ~RVT_S_SEND_ONE;
5127                priv->s_flags |= RVT_S_WAIT_ACK;
5128                bth2 |= IB_BTH_REQ_ACK;
5129        }
5130        qp->s_len -= len;
5131        ps->s_txreq->hdr_dwords = hwords;
5132        ps->s_txreq->sde = priv->s_sde;
5133        ps->s_txreq->ss = ss;
5134        ps->s_txreq->s_cur_size = len;
5135        hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
5136                             middle, ps);
5137        return 1;
5138bail:
5139        hfi1_put_txreq(ps->s_txreq);
5140bail_no_tx:
5141        ps->s_txreq = NULL;
5142        priv->s_flags &= ~RVT_S_BUSY;
5143        /*
5144         * If we didn't get a txreq, the QP will be woken up later to try
5145         * again. Set the flag to indicate which work item to wake
5146         * up.
5147         * (A better algorithm should be found to do this and generalize the
5148         * sleep/wakeup flags.)
5149         */
5150        iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5151        return 0;
5152}
5153
5154static int make_tid_rdma_ack(struct rvt_qp *qp,
5155                             struct ib_other_headers *ohdr,
5156                             struct hfi1_pkt_state *ps)
5157{
5158        struct rvt_ack_entry *e;
5159        struct hfi1_qp_priv *qpriv = qp->priv;
5160        struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5161        u32 hwords, next;
5162        u32 len = 0;
5163        u32 bth1 = 0, bth2 = 0;
5164        int middle = 0;
5165        u16 flow;
5166        struct tid_rdma_request *req, *nreq;
5167
5168        trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5169        /* Don't send an ACK if we aren't supposed to. */
5170        if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
5171                goto bail;
5172
5173        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
5174        hwords = 5;
5175
5176        e = &qp->s_ack_queue[qpriv->r_tid_ack];
5177        req = ack_to_tid_req(e);
5178        /*
5179         * In the RESYNC case, we are exactly one segment past the
5180         * previously sent ack or at the previously sent NAK. So to send
5181         * the resync ack, we go back one segment (which might be part of
5182         * the previous request) and let the do-while loop execute again.
5183         * The advantage of executing the do-while loop is that any data
5184         * received after the previous ack is automatically acked in the
5185         * RESYNC ack. It turns out that for the do-while loop we only need
5186         * to pull back qpriv->r_tid_ack, not the segment
5187         * indices/counters. The scheme works even if the previous request
5188         * was not a TID WRITE request.
5189         */
5190        if (qpriv->resync) {
5191                if (!req->ack_seg || req->ack_seg == req->total_segs)
5192                        qpriv->r_tid_ack = !qpriv->r_tid_ack ?
5193                                rvt_size_atomic(&dev->rdi) :
5194                                qpriv->r_tid_ack - 1;
5195                e = &qp->s_ack_queue[qpriv->r_tid_ack];
5196                req = ack_to_tid_req(e);
5197        }
5198
5199        trace_hfi1_rsp_make_tid_ack(qp, e->psn);
5200        trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
5201                                        req);
5202        /*
5203         * If we've sent all the ACKs that we can, we are done
5204         * until we get more segments...
5205         */
5206        if (!qpriv->s_nak_state && !qpriv->resync &&
5207            req->ack_seg == req->comp_seg)
5208                goto bail;
5209
5210        do {
5211                /*
5212                 * To deal with coalesced ACKs, the acked_tail pointer
5213                 * into the flow array is used. The distance between it
5214                 * and the clear_tail is the number of flows that are
5215                 * being ACK'ed.
5216                 */
5217                req->ack_seg +=
5218                        /* Get up-to-date value */
5219                        CIRC_CNT(req->clear_tail, req->acked_tail,
5220                                 MAX_FLOWS);
5221                /* Advance acked index */
5222                req->acked_tail = req->clear_tail;
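                /*
                 * CIRC_CNT(head, tail, size) is (head - tail) & (size - 1);
                 * e.g. with clear_tail == 3, acked_tail == 1 and
                 * MAX_FLOWS == 8, two segments are acknowledged at once.
                 */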
5223
5224                /*
5225                 * req->clear_tail points to the segment currently being
5226                 * received. So, when sending an ACK, the previous
5227                 * segment is being ACK'ed.
5228                 */
5229                flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
5230                if (req->ack_seg != req->total_segs)
5231                        break;
5232                req->state = TID_REQUEST_COMPLETE;
5233
5234                next = qpriv->r_tid_ack + 1;
5235                if (next > rvt_size_atomic(&dev->rdi))
5236                        next = 0;
5237                qpriv->r_tid_ack = next;
5238                if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
5239                        break;
5240                nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
5241                if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
5242                        break;
5243
5244                /* Move to the next ack entry now */
5245                e = &qp->s_ack_queue[qpriv->r_tid_ack];
5246                req = ack_to_tid_req(e);
5247        } while (1);
5248
5249        /*
5250         * At this point qpriv->r_tid_ack == qpriv->r_tid_tail, but e and
5251         * req could be pointing at the previous ack queue entry.
5252         */
5253        if (qpriv->s_nak_state ||
5254            (qpriv->resync &&
5255             !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
5256             (cmp_psn(qpriv->r_next_psn_kdeth - 1,
5257                      full_flow_psn(&req->flows[flow],
5258                                    req->flows[flow].flow_state.lpsn)) > 0))) {
5259                /*
5260                 * A NAK will implicitly acknowledge all previous TID RDMA
5261                 * requests. Therefore, we NAK with the req->acked_tail
5262                 * segment for the request at qpriv->r_tid_ack (same at
5263                 * this point as the req->clear_tail segment for the
5264                 * qpriv->r_tid_tail request)
5265                 */
5266                e = &qp->s_ack_queue[qpriv->r_tid_ack];
5267                req = ack_to_tid_req(e);
5268                flow = req->acked_tail;
5269        } else if (req->ack_seg == req->total_segs &&
5270                   qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
5271                qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
5272
5273        trace_hfi1_tid_write_rsp_make_tid_ack(qp);
5274        trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
5275                                        req);
5276        hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
5277                                                &bth2);
5278        len = 0;
5279        qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5280        ps->s_txreq->hdr_dwords = hwords;
5281        ps->s_txreq->sde = qpriv->s_sde;
5282        ps->s_txreq->s_cur_size = len;
5283        ps->s_txreq->ss = NULL;
5284        hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
5285                             ps);
5286        ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
5287        return 1;
5288bail:
5289        /*
5290         * Ensure s_rdma_ack_cnt changes are committed prior to resetting
5291         * RVT_S_ACK_PENDING
5292         */
5293        smp_wmb();
5294        qpriv->s_flags &= ~RVT_S_ACK_PENDING;
5295        return 0;
5296}
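
/*
 * Standalone worked example (illustrative only, not driver logic) of the
 * coalesced-ACK ring bookkeeping above, assuming MAX_FLOWS is a power of
 * two of at least 8. Suppose the last ACK left acked_tail at 3 while
 * segments kept completing and clear_tail advanced to 6: three more
 * segments are ACK'ed in one go, the indices catch up, and the ACK is
 * built against the segment just before the one currently being
 * received. The helper name is hypothetical and nothing calls it.
 */
static void ex_coalesced_ack_example(void)
{
        u16 acked_tail = 3, clear_tail = 6;
        u32 ack_seg = 3;

        ack_seg += CIRC_CNT(clear_tail, acked_tail, MAX_FLOWS); /* 3 + 3 */
        acked_tail = clear_tail;                                /* now 6 */

        WARN_ON(ack_seg != 6);
        WARN_ON(CIRC_PREV(acked_tail, MAX_FLOWS) != 5);
}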
5297
5298static int hfi1_send_tid_ok(struct rvt_qp *qp)
5299{
5300        struct hfi1_qp_priv *priv = qp->priv;
5301
5302        return !(priv->s_flags & RVT_S_BUSY ||
5303                 qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
5304                (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
5305                 (priv->s_flags & RVT_S_RESP_PENDING) ||
5306                 !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
5307}
5308
5309void _hfi1_do_tid_send(struct work_struct *work)
5310{
5311        struct iowait_work *w = container_of(work, struct iowait_work, iowork);
5312        struct rvt_qp *qp = iowait_to_qp(w->iow);
5313
5314        hfi1_do_tid_send(qp);
5315}
5316
5317static void hfi1_do_tid_send(struct rvt_qp *qp)
5318{
5319        struct hfi1_pkt_state ps;
5320        struct hfi1_qp_priv *priv = qp->priv;
5321
5322        ps.dev = to_idev(qp->ibqp.device);
5323        ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
5324        ps.ppd = ppd_from_ibp(ps.ibp);
5325        ps.wait = iowait_get_tid_work(&priv->s_iowait);
5326        ps.in_thread = false;
5327        ps.timeout_int = qp->timeout_jiffies / 8;
5328
5329        trace_hfi1_rc_do_tid_send(qp, false);
5330        spin_lock_irqsave(&qp->s_lock, ps.flags);
5331
5332        /* Return if we are already busy processing a work request. */
5333        if (!hfi1_send_tid_ok(qp)) {
5334                if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5335                        iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
5336                spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5337                return;
5338        }
5339
5340        priv->s_flags |= RVT_S_BUSY;
5341
5342        ps.timeout = jiffies + ps.timeout_int;
5343        ps.cpu = priv->s_sde ? priv->s_sde->cpu :
5344                cpumask_first(cpumask_of_node(ps.ppd->dd->node));
5345        ps.pkts_sent = false;
5346
5347        /* ensure a pre-built packet is handled */
5348        ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
5349        do {
5350                /* Check for a constructed packet to be sent. */
5351                if (ps.s_txreq) {
5352                        if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5353                                qp->s_flags |= RVT_S_BUSY;
5354                                ps.wait = iowait_get_ib_work(&priv->s_iowait);
5355                        }
5356                        spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5357
5358                        /*
5359                         * If the packet cannot be sent now, return and
5360                         * the send tasklet will be woken up later.
5361                         */
5362                        if (hfi1_verbs_send(qp, &ps))
5363                                return;
5364
5365                        /* allow other tasks to run */
5366                        if (hfi1_schedule_send_yield(qp, &ps, true))
5367                                return;
5368
5369                        spin_lock_irqsave(&qp->s_lock, ps.flags);
5370                        if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
5371                                qp->s_flags &= ~RVT_S_BUSY;
5372                                priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
5373                                ps.wait = iowait_get_tid_work(&priv->s_iowait);
5374                                if (iowait_flag_set(&priv->s_iowait,
5375                                                    IOWAIT_PENDING_IB))
5376                                        hfi1_schedule_send(qp);
5377                        }
5378                }
5379        } while (hfi1_make_tid_rdma_pkt(qp, &ps));
5380        iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
5381        spin_unlock_irqrestore(&qp->s_lock, ps.flags);
5382}
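
/*
 * Minimal sketch (illustrative only) of the build-under-lock /
 * post-without-lock pattern that hfi1_do_tid_send() above follows.
 * ex_build_pkt() and ex_post_pkt() are hypothetical stand-ins, not hfi1
 * functions, and the RVT_S_BUSY/HFI1_S_TID_BUSY_SET hand-off between the
 * two send legs is deliberately left out.
 */
static bool ex_build_pkt(struct rvt_qp *qp)
{
        return false;   /* pretend there is nothing (more) to send */
}

static void ex_post_pkt(struct rvt_qp *qp)
{
        /* descriptor hand-off to the send engine would happen here */
}

static void ex_send_loop(struct rvt_qp *qp)
{
        unsigned long flags;

        spin_lock_irqsave(&qp->s_lock, flags);
        while (ex_build_pkt(qp)) {
                /*
                 * Post with the lock dropped so the receive path and the
                 * other send leg are not blocked during the hand-off. If
                 * the post had to be deferred, the real code returns and
                 * relies on a later wakeup to re-enter the loop.
                 */
                spin_unlock_irqrestore(&qp->s_lock, flags);
                ex_post_pkt(qp);
                spin_lock_irqsave(&qp->s_lock, flags);
        }
        spin_unlock_irqrestore(&qp->s_lock, flags);
}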
5383
5384static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
5385{
5386        struct hfi1_qp_priv *priv = qp->priv;
5387        struct hfi1_ibport *ibp =
5388                to_iport(qp->ibqp.device, qp->port_num);
5389        struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
5390        struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
5391
5392        return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
5393                                   priv->s_sde ?
5394                                   priv->s_sde->cpu :
5395                                   cpumask_first(cpumask_of_node(dd->node)));
5396}
5397
5398/**
5399 * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
5400 * @qp: the QP
5401 *
5402 * This schedules qp progress on the TID RDMA state machine. Caller
5403 * should hold the s_lock.
5404 * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
5405 * the two state machines can step on each other with respect to the
5406 * RVT_S_BUSY flag.
5407 * Therefore, a modified test is used.
5408 * Return: true if the second leg is scheduled;
5409 *  false if the second leg is not scheduled.
5410 */
5411bool hfi1_schedule_tid_send(struct rvt_qp *qp)
5412{
5413        lockdep_assert_held(&qp->s_lock);
5414        if (hfi1_send_tid_ok(qp)) {
5415                /*
5416                 * The following call returns true if the qp is not on the
5417                 * queue and false if the qp is already on the queue before
5418                 * this call. Either way, the qp will be on the queue when the
5419                 * call returns.
5420                 */
5421                _hfi1_schedule_tid_send(qp);
5422                return true;
5423        }
5424        if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
5425                iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
5426                                IOWAIT_PENDING_TID);
5427        return false;
5428}
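
/*
 * Hypothetical caller (illustrative only, not driver code) showing the
 * contract documented above: qp->s_lock must be held around the call,
 * and a false return simply means the second leg could not be scheduled
 * yet - IOWAIT_PENDING_TID may have been latched so that a later wakeup
 * retries.
 */
static bool ex_kick_tid_leg(struct rvt_qp *qp)
{
        unsigned long flags;
        bool scheduled;

        spin_lock_irqsave(&qp->s_lock, flags);
        scheduled = hfi1_schedule_tid_send(qp);
        spin_unlock_irqrestore(&qp->s_lock, flags);

        return scheduled;
}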
5429
5430bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
5431{
5432        struct rvt_ack_entry *prev;
5433        struct tid_rdma_request *req;
5434        struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
5435        struct hfi1_qp_priv *priv = qp->priv;
5436        u32 s_prev;
5437
5438        s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
5439                (qp->s_tail_ack_queue - 1);
5440        prev = &qp->s_ack_queue[s_prev];
5441
5442        if ((e->opcode == TID_OP(READ_REQ) ||
5443             e->opcode == OP(RDMA_READ_REQUEST)) &&
5444            prev->opcode == TID_OP(WRITE_REQ)) {
5445                req = ack_to_tid_req(prev);
5446                if (req->ack_seg != req->total_segs) {
5447                        priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
5448                        return true;
5449                }
5450        }
5451        return false;
5452}
5453
5454static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
5455{
5456        u64 reg;
5457
5458        /*
5459         * The only sane way to get the amount of
5460         * progress is to read the HW flow state.
5461         */
5462        reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
5463        return mask_psn(reg);
5464}
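
/*
 * Illustrative decode (hypothetical helper, not driver code) of a flow
 * PSN such as the one returned by read_r_next_psn() above: the low
 * HFI1_KDETH_BTH_SEQ_SHIFT bits carry the per-generation sequence number
 * and the bits above them carry the flow generation.
 */
static void ex_decode_flow_psn(u32 flow_psn, u32 *generation, u32 *seq)
{
        *seq = flow_psn & (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
        *generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
}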
5465
5466static void tid_rdma_rcv_err(struct hfi1_packet *packet,
5467                             struct ib_other_headers *ohdr,
5468                             struct rvt_qp *qp, u32 psn, int diff, bool fecn)
5469{
5470        unsigned long flags;
5471
5472        tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
5473        if (fecn) {
5474                spin_lock_irqsave(&qp->s_lock, flags);
5475                qp->s_flags |= RVT_S_ECN;
5476                spin_unlock_irqrestore(&qp->s_lock, flags);
5477        }
5478}
5479
5480static void update_r_next_psn_fecn(struct hfi1_packet *packet,
5481                                   struct hfi1_qp_priv *priv,
5482                                   struct hfi1_ctxtdata *rcd,
5483                                   struct tid_rdma_flow *flow,
5484                                   bool fecn)
5485{
5486        /*
5487         * If a start/middle packet is delivered here due to
5488         * RSM rule and FECN, we need to update the r_next_psn.
5489         */
5490        if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
5491            !(priv->s_flags & HFI1_R_TID_SW_PSN)) {
5492                struct hfi1_devdata *dd = rcd->dd;
5493
5494                flow->flow_state.r_next_psn =
5495                        read_r_next_psn(dd, rcd->ctxt, flow->idx);
5496        }
5497}
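
/*
 * Purely illustrative restatement (hypothetical helper, not driver code)
 * of the condition documented in update_r_next_psn_fecn() above: only an
 * eager-delivered packet that carried FECN, while the hardware rather
 * than software is tracking the flow PSN, forces a re-read of the
 * hardware flow state.
 */
static inline bool ex_must_reread_hw_psn(const struct hfi1_packet *packet,
                                         const struct hfi1_qp_priv *priv,
                                         bool fecn)
{
        return fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
               !(priv->s_flags & HFI1_R_TID_SW_PSN);
}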
5498