linux/drivers/infiniband/sw/siw/siw_qp_rx.c
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2
   3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
   4/* Copyright (c) 2008-2019, IBM Corporation */
   5
   6#include <linux/errno.h>
   7#include <linux/types.h>
   8#include <linux/net.h>
   9#include <linux/scatterlist.h>
  10#include <linux/highmem.h>
  11
  12#include <rdma/iw_cm.h>
  13#include <rdma/ib_verbs.h>
  14
  15#include "siw.h"
  16#include "siw_verbs.h"
  17#include "siw_mem.h"
  18
  19/*
  20 * siw_rx_umem()
  21 *
   22 * Receive data of size @len into target referenced by @dest_addr.
  23 *
  24 * @srx:        Receive Context
  25 * @umem:       siw representation of target memory
  26 * @dest_addr:  user virtual address
  27 * @len:        number of bytes to place
  28 */
  29static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
  30                       u64 dest_addr, int len)
  31{
  32        int copied = 0;
  33
  34        while (len) {
  35                struct page *p;
  36                int pg_off, bytes, rv;
  37                void *dest;
  38
  39                p = siw_get_upage(umem, dest_addr);
  40                if (unlikely(!p)) {
  41                        pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
  42                                __func__, qp_id(rx_qp(srx)),
  43                                (void *)(uintptr_t)dest_addr,
  44                                (void *)(uintptr_t)umem->fp_addr);
  45                        /* siw internal error */
  46                        srx->skb_copied += copied;
  47                        srx->skb_new -= copied;
  48
  49                        return -EFAULT;
  50                }
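                     /* Copy at most up to the end of the current target page */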
  51                pg_off = dest_addr & ~PAGE_MASK;
  52                bytes = min(len, (int)PAGE_SIZE - pg_off);
  53
  54                siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
  55
  56                dest = kmap_atomic(p);
  57                rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
  58                                   bytes);
  59
  60                if (unlikely(rv)) {
  61                        kunmap_atomic(dest);
  62                        srx->skb_copied += copied;
  63                        srx->skb_new -= copied;
  64
  65                        pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
  66                                qp_id(rx_qp(srx)), __func__, len, p, rv);
  67
  68                        return -EFAULT;
  69                }
  70                if (srx->mpa_crc_hd) {
  71                        if (rx_qp(srx)->kernel_verbs) {
  72                                crypto_shash_update(srx->mpa_crc_hd,
  73                                        (u8 *)(dest + pg_off), bytes);
  74                                kunmap_atomic(dest);
  75                        } else {
  76                                kunmap_atomic(dest);
  77                                /*
  78                                 * Do CRC on original, not target buffer.
  79                                 * Some user land applications may
  80                                 * concurrently write the target buffer,
  81                                 * which would yield a broken CRC.
   82                                 * Walking the skb twice is very inefficient.
  83                                 * Folding the CRC into skb_copy_bits()
  84                                 * would be much better, but is currently
  85                                 * not supported.
  86                                 */
  87                                siw_crc_skb(srx, bytes);
  88                        }
  89                } else {
  90                        kunmap_atomic(dest);
  91                }
  92                srx->skb_offset += bytes;
  93                copied += bytes;
  94                len -= bytes;
  95                dest_addr += bytes;
  96                pg_off = 0;
  97        }
  98        srx->skb_copied += copied;
  99        srx->skb_new -= copied;
 100
 101        return copied;
 102}
 103
 104static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
 105{
 106        int rv;
 107
 108        siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
 109
 110        rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
 111        if (unlikely(rv)) {
 112                pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
 113                        qp_id(rx_qp(srx)), __func__, len, kva, rv);
 114
 115                return rv;
 116        }
 117        if (srx->mpa_crc_hd)
 118                crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
 119
 120        srx->skb_offset += len;
 121        srx->skb_copied += len;
 122        srx->skb_new -= len;
 123
 124        return len;
 125}
 126
 127static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
 128                      struct siw_mem *mem, u64 addr, int len)
 129{
 130        struct siw_pbl *pbl = mem->pbl;
 131        u64 offset = addr - mem->va;
 132        int copied = 0;
 133
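             /*
              * Walk the physical buffer list chunk by chunk. Each chunk
              * address is used as a kernel virtual address, so the data
              * can be placed via siw_rx_kva().
              */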
 134        while (len) {
 135                int bytes;
 136                dma_addr_t buf_addr =
 137                        siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
 138                if (!buf_addr)
 139                        break;
 140
 141                bytes = min(bytes, len);
 142                if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
 143                        copied += bytes;
 144                        offset += bytes;
 145                        len -= bytes;
 146                } else {
 147                        break;
 148                }
 149        }
 150        return copied;
 151}
 152
 153/*
 154 * siw_rresp_check_ntoh()
 155 *
 156 * Check incoming RRESP fragment header against expected
 157 * header values and update expected values for potential next
 158 * fragment.
 159 *
  160 * NOTE: This function must be called only if an RRESP DDP segment
 161 *       starts but not for fragmented consecutive pieces of an
 162 *       already started DDP segment.
 163 */
 164static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
 165                                struct siw_rx_fpdu *frx)
 166{
 167        struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
 168        struct siw_wqe *wqe = &frx->wqe_active;
 169        enum ddp_ecode ecode;
 170
 171        u32 sink_stag = be32_to_cpu(rresp->sink_stag);
 172        u64 sink_to = be64_to_cpu(rresp->sink_to);
 173
 174        if (frx->first_ddp_seg) {
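                     /*
                      * First segment of the RRESP: expected sink STag and
                      * tagged offset come from the pending READ request
                      * (single SGE) just fetched from the ORQ.
                      */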
 175                srx->ddp_stag = wqe->sqe.sge[0].lkey;
 176                srx->ddp_to = wqe->sqe.sge[0].laddr;
 177                frx->pbl_idx = 0;
 178        }
 179        /* Below checks extend beyond the semantics of DDP, and
 180         * into RDMAP:
 181         * We check if the read response matches exactly the
  182         * read request which was sent to the remote peer to
 183         * trigger this read response. RFC5040/5041 do not
 184         * always have a proper error code for the detected
 185         * error cases. We choose 'base or bounds error' for
 186         * cases where the inbound STag is valid, but offset
 187         * or length do not match our response receive state.
 188         */
 189        if (unlikely(srx->ddp_stag != sink_stag)) {
 190                pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
 191                        qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
 192                ecode = DDP_ECODE_T_INVALID_STAG;
 193                goto error;
 194        }
 195        if (unlikely(srx->ddp_to != sink_to)) {
 196                pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
 197                        qp_id(rx_qp(srx)), (unsigned long long)sink_to,
 198                        (unsigned long long)srx->ddp_to);
 199                ecode = DDP_ECODE_T_BASE_BOUNDS;
 200                goto error;
 201        }
 202        if (unlikely(!frx->more_ddp_segs &&
 203                     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
 204                pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
 205                        qp_id(rx_qp(srx)),
 206                        wqe->processed + srx->fpdu_part_rem, wqe->bytes);
 207                ecode = DDP_ECODE_T_BASE_BOUNDS;
 208                goto error;
 209        }
 210        return 0;
 211error:
 212        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
 213                           DDP_ETYPE_TAGGED_BUF, ecode, 0);
 214        return -EINVAL;
 215}
 216
 217/*
 218 * siw_write_check_ntoh()
 219 *
 220 * Check incoming WRITE fragment header against expected
 221 * header values and update expected values for potential next
 222 * fragment
 223 *
 224 * NOTE: This function must be called only if a WRITE DDP segment
 225 *       starts but not for fragmented consecutive pieces of an
 226 *       already started DDP segment.
 227 */
 228static int siw_write_check_ntoh(struct siw_rx_stream *srx,
 229                                struct siw_rx_fpdu *frx)
 230{
 231        struct iwarp_rdma_write *write = &srx->hdr.rwrite;
 232        enum ddp_ecode ecode;
 233
 234        u32 sink_stag = be32_to_cpu(write->sink_stag);
 235        u64 sink_to = be64_to_cpu(write->sink_to);
 236
 237        if (frx->first_ddp_seg) {
 238                srx->ddp_stag = sink_stag;
 239                srx->ddp_to = sink_to;
 240                frx->pbl_idx = 0;
 241        } else {
 242                if (unlikely(srx->ddp_stag != sink_stag)) {
 243                        pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
 244                                qp_id(rx_qp(srx)), sink_stag,
 245                                srx->ddp_stag);
 246                        ecode = DDP_ECODE_T_INVALID_STAG;
 247                        goto error;
 248                }
 249                if (unlikely(srx->ddp_to != sink_to)) {
 250                        pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
 251                                qp_id(rx_qp(srx)),
 252                                (unsigned long long)sink_to,
 253                                (unsigned long long)srx->ddp_to);
 254                        ecode = DDP_ECODE_T_BASE_BOUNDS;
 255                        goto error;
 256                }
 257        }
 258        return 0;
 259error:
 260        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
 261                           DDP_ETYPE_TAGGED_BUF, ecode, 0);
 262        return -EINVAL;
 263}
 264
 265/*
 266 * siw_send_check_ntoh()
 267 *
 268 * Check incoming SEND fragment header against expected
 269 * header values and update expected MSN if no next
 270 * fragment expected
 271 *
 272 * NOTE: This function must be called only if a SEND DDP segment
 273 *       starts but not for fragmented consecutive pieces of an
 274 *       already started DDP segment.
 275 */
 276static int siw_send_check_ntoh(struct siw_rx_stream *srx,
 277                               struct siw_rx_fpdu *frx)
 278{
 279        struct iwarp_send_inv *send = &srx->hdr.send_inv;
 280        struct siw_wqe *wqe = &frx->wqe_active;
 281        enum ddp_ecode ecode;
 282
 283        u32 ddp_msn = be32_to_cpu(send->ddp_msn);
 284        u32 ddp_mo = be32_to_cpu(send->ddp_mo);
 285        u32 ddp_qn = be32_to_cpu(send->ddp_qn);
 286
 287        if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
 288                pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
 289                        qp_id(rx_qp(srx)), ddp_qn);
 290                ecode = DDP_ECODE_UT_INVALID_QN;
 291                goto error;
 292        }
 293        if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
 294                pr_warn("siw: [QP %u]: send msn: %u != %u\n",
 295                        qp_id(rx_qp(srx)), ddp_msn,
 296                        srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
 297                ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
 298                goto error;
 299        }
 300        if (unlikely(ddp_mo != wqe->processed)) {
 301                pr_warn("siw: [QP %u], send mo: %u != %u\n",
 302                        qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
 303                ecode = DDP_ECODE_UT_INVALID_MO;
 304                goto error;
 305        }
 306        if (frx->first_ddp_seg) {
 307                /* initialize user memory write position */
 308                frx->sge_idx = 0;
 309                frx->sge_off = 0;
 310                frx->pbl_idx = 0;
 311
 312                /* only valid for SEND_INV and SEND_SE_INV operations */
 313                srx->inval_stag = be32_to_cpu(send->inval_stag);
 314        }
 315        if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
 316                siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
 317                           wqe->bytes, wqe->processed, srx->fpdu_part_rem);
 318                wqe->wc_status = SIW_WC_LOC_LEN_ERR;
 319                ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
 320                goto error;
 321        }
 322        return 0;
 323error:
 324        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
 325                           DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
 326        return -EINVAL;
 327}
 328
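     /*
      * siw_rqe_get()
      *
      * Fetch the next receive queue element from the QP's RQ or, if
      * attached, from the SRQ, and initialize the current untagged
      * receive WQE from it. Returns NULL if no receive buffer is
      * available. Signals an SRQ limit event if an armed SRQ drops
      * below its limit.
      */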
 329static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
 330{
 331        struct siw_rqe *rqe;
 332        struct siw_srq *srq;
 333        struct siw_wqe *wqe = NULL;
 334        bool srq_event = false;
 335        unsigned long flags;
 336
 337        srq = qp->srq;
 338        if (srq) {
 339                spin_lock_irqsave(&srq->lock, flags);
 340                if (unlikely(!srq->num_rqe))
 341                        goto out;
 342
 343                rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
 344        } else {
 345                if (unlikely(!qp->recvq))
 346                        goto out;
 347
 348                rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
 349        }
 350        if (likely(rqe->flags == SIW_WQE_VALID)) {
 351                int num_sge = rqe->num_sge;
 352
 353                if (likely(num_sge <= SIW_MAX_SGE)) {
 354                        int i = 0;
 355
 356                        wqe = rx_wqe(&qp->rx_untagged);
 357                        rx_type(wqe) = SIW_OP_RECEIVE;
 358                        wqe->wr_status = SIW_WR_INPROGRESS;
 359                        wqe->bytes = 0;
 360                        wqe->processed = 0;
 361
 362                        wqe->rqe.id = rqe->id;
 363                        wqe->rqe.num_sge = num_sge;
 364
 365                        while (i < num_sge) {
 366                                wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
 367                                wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
 368                                wqe->rqe.sge[i].length = rqe->sge[i].length;
 369                                wqe->bytes += wqe->rqe.sge[i].length;
 370                                wqe->mem[i] = NULL;
 371                                i++;
 372                        }
  373                        /* can be re-used by the application */
 374                        smp_store_mb(rqe->flags, 0);
 375                } else {
 376                        siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
 377                        if (srq)
 378                                spin_unlock_irqrestore(&srq->lock, flags);
 379                        return NULL;
 380                }
 381                if (!srq) {
 382                        qp->rq_get++;
 383                } else {
 384                        if (srq->armed) {
 385                                /* Test SRQ limit */
 386                                u32 off = (srq->rq_get + srq->limit) %
 387                                          srq->num_rqe;
 388                                struct siw_rqe *rqe2 = &srq->recvq[off];
 389
 390                                if (!(rqe2->flags & SIW_WQE_VALID)) {
 391                                        srq->armed = 0;
 392                                        srq_event = true;
 393                                }
 394                        }
 395                        srq->rq_get++;
 396                }
 397        }
 398out:
 399        if (srq) {
 400                spin_unlock_irqrestore(&srq->lock, flags);
 401                if (srq_event)
 402                        siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
 403        }
 404        return wqe;
 405}
 406
 407/*
 408 * siw_proc_send:
 409 *
 410 * Process one incoming SEND and place data into memory referenced by
 411 * receive wqe.
 412 *
 413 * Function supports partially received sends (suspending/resuming
 414 * current receive wqe processing)
 415 *
 416 * return value:
 417 *      0:       reached the end of a DDP segment
 418 *      -EAGAIN: to be called again to finish the DDP segment
 419 */
 420int siw_proc_send(struct siw_qp *qp)
 421{
 422        struct siw_rx_stream *srx = &qp->rx_stream;
 423        struct siw_rx_fpdu *frx = &qp->rx_untagged;
 424        struct siw_wqe *wqe;
 425        u32 data_bytes; /* all data bytes available */
 426        u32 rcvd_bytes; /* sum of data bytes rcvd */
 427        int rv = 0;
 428
 429        if (frx->first_ddp_seg) {
 430                wqe = siw_rqe_get(qp);
 431                if (unlikely(!wqe)) {
 432                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 433                                           DDP_ETYPE_UNTAGGED_BUF,
 434                                           DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
 435                        return -ENOENT;
 436                }
 437        } else {
 438                wqe = rx_wqe(frx);
 439        }
 440        if (srx->state == SIW_GET_DATA_START) {
 441                rv = siw_send_check_ntoh(srx, frx);
 442                if (unlikely(rv)) {
 443                        siw_qp_event(qp, IB_EVENT_QP_FATAL);
 444                        return rv;
 445                }
 446                if (!srx->fpdu_part_rem) /* zero length SEND */
 447                        return 0;
 448        }
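             /*
              * Consume no more than what is left of the current DDP
              * segment and what the current skb still provides.
              */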
 449        data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
 450        rcvd_bytes = 0;
 451
  452        /* A zero-length SEND will skip the loop below */
 453        while (data_bytes) {
 454                struct ib_pd *pd;
 455                struct siw_mem **mem, *mem_p;
 456                struct siw_sge *sge;
 457                u32 sge_bytes; /* data bytes avail for SGE */
 458
 459                sge = &wqe->rqe.sge[frx->sge_idx];
 460
 461                if (!sge->length) {
  462                        /* just skip empty SGEs */
 463                        frx->sge_idx++;
 464                        frx->sge_off = 0;
 465                        frx->pbl_idx = 0;
 466                        continue;
 467                }
 468                sge_bytes = min(data_bytes, sge->length - frx->sge_off);
 469                mem = &wqe->mem[frx->sge_idx];
 470
 471                /*
 472                 * check with QP's PD if no SRQ present, SRQ's PD otherwise
 473                 */
 474                pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
 475
 476                rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
 477                                   frx->sge_off, sge_bytes);
 478                if (unlikely(rv)) {
 479                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 480                                           DDP_ETYPE_CATASTROPHIC,
 481                                           DDP_ECODE_CATASTROPHIC, 0);
 482
 483                        siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
 484                        break;
 485                }
 486                mem_p = *mem;
 487                if (mem_p->mem_obj == NULL)
 488                        rv = siw_rx_kva(srx,
 489                                (void *)(uintptr_t)(sge->laddr + frx->sge_off),
 490                                sge_bytes);
 491                else if (!mem_p->is_pbl)
 492                        rv = siw_rx_umem(srx, mem_p->umem,
 493                                         sge->laddr + frx->sge_off, sge_bytes);
 494                else
 495                        rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
 496                                        sge->laddr + frx->sge_off, sge_bytes);
 497
 498                if (unlikely(rv != sge_bytes)) {
 499                        wqe->processed += rcvd_bytes;
 500
 501                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 502                                           DDP_ETYPE_CATASTROPHIC,
 503                                           DDP_ECODE_CATASTROPHIC, 0);
 504                        return -EINVAL;
 505                }
 506                frx->sge_off += rv;
 507
 508                if (frx->sge_off == sge->length) {
 509                        frx->sge_idx++;
 510                        frx->sge_off = 0;
 511                        frx->pbl_idx = 0;
 512                }
 513                data_bytes -= rv;
 514                rcvd_bytes += rv;
 515
 516                srx->fpdu_part_rem -= rv;
 517                srx->fpdu_part_rcvd += rv;
 518        }
 519        wqe->processed += rcvd_bytes;
 520
 521        if (!srx->fpdu_part_rem)
 522                return 0;
 523
 524        return (rv < 0) ? rv : -EAGAIN;
 525}
 526
 527/*
 528 * siw_proc_write:
 529 *
 530 * Place incoming WRITE after referencing and checking target buffer
  531 *
 532 * Function supports partially received WRITEs (suspending/resuming
 533 * current receive processing)
 534 *
 535 * return value:
 536 *      0:       reached the end of a DDP segment
 537 *      -EAGAIN: to be called again to finish the DDP segment
 538 */
 539int siw_proc_write(struct siw_qp *qp)
 540{
 541        struct siw_rx_stream *srx = &qp->rx_stream;
 542        struct siw_rx_fpdu *frx = &qp->rx_tagged;
 543        struct siw_mem *mem;
 544        int bytes, rv;
 545
 546        if (srx->state == SIW_GET_DATA_START) {
 547                if (!srx->fpdu_part_rem) /* zero length WRITE */
 548                        return 0;
 549
 550                rv = siw_write_check_ntoh(srx, frx);
 551                if (unlikely(rv)) {
 552                        siw_qp_event(qp, IB_EVENT_QP_FATAL);
 553                        return rv;
 554                }
 555        }
 556        bytes = min(srx->fpdu_part_rem, srx->skb_new);
 557
 558        if (frx->first_ddp_seg) {
 559                struct siw_wqe *wqe = rx_wqe(frx);
 560
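                     /*
                      * First segment of a tagged message: resolve the
                      * target memory object from the STag index (STag
                      * without its 8 bit key).
                      */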
 561                rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
 562                if (unlikely(!rx_mem(frx))) {
 563                        siw_dbg_qp(qp,
 564                                   "sink stag not found/invalid, stag 0x%08x\n",
 565                                   srx->ddp_stag);
 566
 567                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 568                                           DDP_ETYPE_TAGGED_BUF,
 569                                           DDP_ECODE_T_INVALID_STAG, 0);
 570                        return -EINVAL;
 571                }
 572                wqe->rqe.num_sge = 1;
 573                rx_type(wqe) = SIW_OP_WRITE;
 574                wqe->wr_status = SIW_WR_INPROGRESS;
 575        }
 576        mem = rx_mem(frx);
 577
 578        /*
 579         * Check if application re-registered memory with different
 580         * key field of STag.
 581         */
 582        if (unlikely(mem->stag != srx->ddp_stag)) {
 583                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 584                                   DDP_ETYPE_TAGGED_BUF,
 585                                   DDP_ECODE_T_INVALID_STAG, 0);
 586                return -EINVAL;
 587        }
 588        rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
 589                           IB_ACCESS_REMOTE_WRITE, bytes);
 590        if (unlikely(rv)) {
 591                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 592                                   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
 593                                   0);
 594
 595                siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
 596
 597                return -EINVAL;
 598        }
 599
 600        if (mem->mem_obj == NULL)
 601                rv = siw_rx_kva(srx,
 602                        (void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
 603                        bytes);
 604        else if (!mem->is_pbl)
 605                rv = siw_rx_umem(srx, mem->umem,
 606                                 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
 607        else
 608                rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
 609                                srx->ddp_to + srx->fpdu_part_rcvd, bytes);
 610
 611        if (unlikely(rv != bytes)) {
 612                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 613                                   DDP_ETYPE_CATASTROPHIC,
 614                                   DDP_ECODE_CATASTROPHIC, 0);
 615                return -EINVAL;
 616        }
 617        srx->fpdu_part_rem -= rv;
 618        srx->fpdu_part_rcvd += rv;
 619
 620        if (!srx->fpdu_part_rem) {
 621                srx->ddp_to += srx->fpdu_part_rcvd;
 622                return 0;
 623        }
 624        return -EAGAIN;
 625}
 626
 627/*
  628 * Inbound RREQs cannot carry user data.
 629 */
 630int siw_proc_rreq(struct siw_qp *qp)
 631{
 632        struct siw_rx_stream *srx = &qp->rx_stream;
 633
 634        if (!srx->fpdu_part_rem)
 635                return 0;
 636
 637        pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
 638                be16_to_cpu(srx->hdr.ctrl.mpa_len));
 639
 640        return -EPROTO;
 641}
 642
 643/*
 644 * siw_init_rresp:
 645 *
 646 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 647 * Put it at the tail of the IRQ, if there is another WQE currently in
 648 * transmit processing. If not, make it the current WQE to be processed
 649 * and schedule transmit processing.
 650 *
 651 * Can be called from softirq context and from process
 652 * context (RREAD socket loopback case!)
 653 *
 654 * return value:
 655 *      0:      success,
 656 *              failure code otherwise
 657 */
 658
 659static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
 660{
 661        struct siw_wqe *tx_work = tx_wqe(qp);
 662        struct siw_sqe *resp;
 663
 664        uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
 665                 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
 666        uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
 667                 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
 668                 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
 669                 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
 670
 671        int run_sq = 1, rv = 0;
 672        unsigned long flags;
 673
 674        if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
 675                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 676                                   DDP_ETYPE_UNTAGGED_BUF,
 677                                   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
 678                return -EPROTO;
 679        }
 680        spin_lock_irqsave(&qp->sq_lock, flags);
 681
 682        if (tx_work->wr_status == SIW_WR_IDLE) {
 683                /*
 684                 * immediately schedule READ response w/o
 685                 * consuming IRQ entry: IRQ must be empty.
 686                 */
 687                tx_work->processed = 0;
 688                tx_work->mem[0] = NULL;
 689                tx_work->wr_status = SIW_WR_QUEUED;
 690                resp = &tx_work->sqe;
 691        } else {
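                     /*
                      * Transmission is in progress: queue the READ
                      * RESPONSE in the IRQ. The active transmitter will
                      * pick it up, so the SQ need not be restarted.
                      */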
 692                resp = irq_alloc_free(qp);
 693                run_sq = 0;
 694        }
 695        if (likely(resp)) {
 696                resp->opcode = SIW_OP_READ_RESPONSE;
 697
 698                resp->sge[0].length = length;
 699                resp->sge[0].laddr = laddr;
 700                resp->sge[0].lkey = lkey;
 701
  702                /* Keep the message sequence number aside for potential
 703                 * error reporting during Read Response generation.
 704                 */
 705                resp->sge[1].length = msn;
 706
 707                resp->raddr = raddr;
 708                resp->rkey = rkey;
 709                resp->num_sge = length ? 1 : 0;
 710
 711                /* RRESP now valid as current TX wqe or placed into IRQ */
 712                smp_store_mb(resp->flags, SIW_WQE_VALID);
 713        } else {
 714                pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
 715                        qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
 716
 717                siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
 718                                   RDMAP_ETYPE_REMOTE_OPERATION,
 719                                   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
 720                rv = -EPROTO;
 721        }
 722
 723        spin_unlock_irqrestore(&qp->sq_lock, flags);
 724
 725        if (run_sq)
 726                rv = siw_sq_start(qp);
 727
 728        return rv;
 729}
 730
 731/*
  732 * Only called at start of Read.Response processing.
  733 * Transfer pending Read from tip of ORQ into current rx wqe,
 734 * but keep ORQ entry valid until Read.Response processing done.
 735 * No Queue locking needed.
 736 */
 737static int siw_orqe_start_rx(struct siw_qp *qp)
 738{
 739        struct siw_sqe *orqe;
 740        struct siw_wqe *wqe = NULL;
 741
 742        /* make sure ORQ indices are current */
 743        smp_mb();
 744
 745        orqe = orq_get_current(qp);
 746        if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
 747                /* RRESP is a TAGGED RDMAP operation */
 748                wqe = rx_wqe(&qp->rx_tagged);
 749                wqe->sqe.id = orqe->id;
 750                wqe->sqe.opcode = orqe->opcode;
 751                wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
 752                wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
 753                wqe->sqe.sge[0].length = orqe->sge[0].length;
 754                wqe->sqe.flags = orqe->flags;
 755                wqe->sqe.num_sge = 1;
 756                wqe->bytes = orqe->sge[0].length;
 757                wqe->processed = 0;
 758                wqe->mem[0] = NULL;
 759                /* make sure WQE is completely written before valid */
 760                smp_wmb();
 761                wqe->wr_status = SIW_WR_INPROGRESS;
 762
 763                return 0;
 764        }
 765        return -EPROTO;
 766}
 767
 768/*
 769 * siw_proc_rresp:
 770 *
 771 * Place incoming RRESP data into memory referenced by RREQ WQE
 772 * which is at the tip of the ORQ
 773 *
 774 * Function supports partially received RRESP's (suspending/resuming
 775 * current receive processing)
 776 */
 777int siw_proc_rresp(struct siw_qp *qp)
 778{
 779        struct siw_rx_stream *srx = &qp->rx_stream;
 780        struct siw_rx_fpdu *frx = &qp->rx_tagged;
 781        struct siw_wqe *wqe = rx_wqe(frx);
 782        struct siw_mem **mem, *mem_p;
 783        struct siw_sge *sge;
 784        int bytes, rv;
 785
 786        if (frx->first_ddp_seg) {
 787                if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
 788                        pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
 789                                qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
 790                        rv = -EPROTO;
 791                        goto error_term;
 792                }
 793                /*
 794                 * fetch pending RREQ from orq
 795                 */
 796                rv = siw_orqe_start_rx(qp);
 797                if (rv) {
 798                        pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
 799                                qp_id(qp), qp->orq_get % qp->attrs.orq_size);
 800                        goto error_term;
 801                }
 802                rv = siw_rresp_check_ntoh(srx, frx);
 803                if (unlikely(rv)) {
 804                        siw_qp_event(qp, IB_EVENT_QP_FATAL);
 805                        return rv;
 806                }
 807        } else {
 808                if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
 809                        pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
 810                                qp_id(qp), wqe->wr_status);
 811                        rv = -EPROTO;
 812                        goto error_term;
 813                }
 814        }
 815        if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
 816                return 0;
 817
 818        sge = wqe->sqe.sge; /* there is only one */
 819        mem = &wqe->mem[0];
 820
 821        if (!(*mem)) {
 822                /*
 823                 * check target memory which resolves memory on first fragment
 824                 */
 825                rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
 826                                   wqe->bytes);
 827                if (unlikely(rv)) {
 828                        siw_dbg_qp(qp, "target mem check: %d\n", rv);
 829                        wqe->wc_status = SIW_WC_LOC_PROT_ERR;
 830
 831                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 832                                           DDP_ETYPE_TAGGED_BUF,
 833                                           siw_tagged_error(-rv), 0);
 834
 835                        siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
 836
 837                        return -EINVAL;
 838                }
 839        }
 840        mem_p = *mem;
 841
 842        bytes = min(srx->fpdu_part_rem, srx->skb_new);
 843
 844        if (mem_p->mem_obj == NULL)
 845                rv = siw_rx_kva(srx,
 846                        (void *)(uintptr_t)(sge->laddr + wqe->processed),
 847                        bytes);
 848        else if (!mem_p->is_pbl)
 849                rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
 850                                 bytes);
 851        else
 852                rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
 853                                sge->laddr + wqe->processed, bytes);
 854        if (rv != bytes) {
 855                wqe->wc_status = SIW_WC_GENERAL_ERR;
 856                rv = -EINVAL;
 857                goto error_term;
 858        }
 859        srx->fpdu_part_rem -= rv;
 860        srx->fpdu_part_rcvd += rv;
 861        wqe->processed += rv;
 862
 863        if (!srx->fpdu_part_rem) {
 864                srx->ddp_to += srx->fpdu_part_rcvd;
 865                return 0;
 866        }
 867        return -EAGAIN;
 868
 869error_term:
 870        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
 871                           DDP_ECODE_CATASTROPHIC, 0);
 872        return rv;
 873}
 874
 875int siw_proc_terminate(struct siw_qp *qp)
 876{
 877        struct siw_rx_stream *srx = &qp->rx_stream;
 878        struct sk_buff *skb = srx->skb;
 879        struct iwarp_terminate *term = &srx->hdr.terminate;
 880        union iwarp_hdr term_info;
 881        u8 *infop = (u8 *)&term_info;
 882        enum rdma_opcode op;
 883        u16 to_copy = sizeof(struct iwarp_ctrl);
 884
 885        pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
 886                __rdmap_term_layer(term), __rdmap_term_etype(term),
 887                __rdmap_term_ecode(term));
 888
 889        if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
 890            be32_to_cpu(term->ddp_msn) !=
 891                    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
 892            be32_to_cpu(term->ddp_mo) != 0) {
 893                pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
 894                        be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
 895                        be32_to_cpu(term->ddp_mo));
 896                return -ECONNRESET;
 897        }
 898        /*
 899         * Receive remaining pieces of TERM if indicated
 900         */
 901        if (!term->flag_m)
 902                return -ECONNRESET;
 903
  904        /* Do not make the effort to reassemble a network-fragmented
 905         * TERM message
 906         */
 907        if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
 908                return -ECONNRESET;
 909
 910        memset(infop, 0, sizeof(term_info));
 911
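             /*
              * First fetch the control part of the terminated packet's
              * header to learn its opcode and thus how many more header
              * bytes to copy.
              */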
 912        skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
 913
 914        op = __rdmap_get_opcode(&term_info.ctrl);
 915        if (op >= RDMAP_TERMINATE)
 916                goto out;
 917
 918        infop += to_copy;
 919        srx->skb_offset += to_copy;
 920        srx->skb_new -= to_copy;
 921        srx->skb_copied += to_copy;
 922        srx->fpdu_part_rcvd += to_copy;
 923        srx->fpdu_part_rem -= to_copy;
 924
 925        to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
 926
  927        /* Again, no network-fragmented TERMs */
 928        if (to_copy + MPA_CRC_SIZE > srx->skb_new)
 929                return -ECONNRESET;
 930
 931        skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
 932
 933        if (term->flag_r) {
 934                siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
 935                           op, be16_to_cpu(term_info.ctrl.mpa_len),
 936                           term->flag_m ? "valid" : "invalid");
 937        } else if (term->flag_d) {
 938                siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
 939                           op, be16_to_cpu(term_info.ctrl.mpa_len),
 940                           term->flag_m ? "valid" : "invalid");
 941        }
 942out:
 943        srx->skb_new -= to_copy;
 944        srx->skb_offset += to_copy;
 945        srx->skb_copied += to_copy;
 946        srx->fpdu_part_rcvd += to_copy;
 947        srx->fpdu_part_rem -= to_copy;
 948
 949        return -ECONNRESET;
 950}
 951
 952static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
 953{
 954        struct sk_buff *skb = srx->skb;
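             /*
              * The trailer consists of 0..3 pad bytes immediately followed
              * by the 4 byte MPA CRC. Point tbuf at the first pad byte so
              * a single copy fills both.
              */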
 955        u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
 956        __wsum crc_in, crc_own = 0;
 957
 958        siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
 959                   srx->fpdu_part_rem, srx->skb_new, srx->pad);
 960
 961        if (srx->skb_new < srx->fpdu_part_rem)
 962                return -EAGAIN;
 963
 964        skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
 965
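             /*
              * The MPA CRC covers the pad bytes, which were not part of
              * the payload already fed into the digest. Add them now.
              */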
 966        if (srx->mpa_crc_hd && srx->pad)
 967                crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
 968
 969        srx->skb_new -= srx->fpdu_part_rem;
 970        srx->skb_offset += srx->fpdu_part_rem;
 971        srx->skb_copied += srx->fpdu_part_rem;
 972
 973        if (!srx->mpa_crc_hd)
 974                return 0;
 975
 976        /*
 977         * CRC32 is computed, transmitted and received directly in NBO,
 978         * so there's never a reason to convert byte order.
 979         */
 980        crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
 981        crc_in = (__force __wsum)srx->trailer.crc;
 982
 983        if (unlikely(crc_in != crc_own)) {
 984                pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
 985                        crc_in, crc_own, qp->rx_stream.rdmap_op);
 986
 987                siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
 988                                   LLP_ETYPE_MPA,
 989                                   LLP_ECODE_RECEIVED_CRC, 0);
 990                return -EINVAL;
 991        }
 992        return 0;
 993}
 994
 995#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
 996
 997static int siw_get_hdr(struct siw_rx_stream *srx)
 998{
 999        struct sk_buff *skb = srx->skb;
1000        struct siw_qp *qp = rx_qp(srx);
1001        struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1002        struct siw_rx_fpdu *frx;
1003        u8 opcode;
1004        int bytes;
1005
1006        if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1007                /*
 1008                 * copy a minimum-sized (tagged) DDP frame control part
1009                 */
1010                bytes = min_t(int, srx->skb_new,
1011                              MIN_DDP_HDR - srx->fpdu_part_rcvd);
1012
1013                skb_copy_bits(skb, srx->skb_offset,
1014                              (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1015
1016                srx->fpdu_part_rcvd += bytes;
1017
1018                srx->skb_new -= bytes;
1019                srx->skb_offset += bytes;
1020                srx->skb_copied += bytes;
1021
1022                if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1023                        return -EAGAIN;
1024
1025                if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1026                        enum ddp_etype etype;
1027                        enum ddp_ecode ecode;
1028
1029                        pr_warn("siw: received ddp version unsupported %d\n",
1030                                __ddp_get_version(c_hdr));
1031
1032                        if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1033                                etype = DDP_ETYPE_TAGGED_BUF;
1034                                ecode = DDP_ECODE_T_VERSION;
1035                        } else {
1036                                etype = DDP_ETYPE_UNTAGGED_BUF;
1037                                ecode = DDP_ECODE_UT_VERSION;
1038                        }
1039                        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1040                                           etype, ecode, 0);
1041                        return -EINVAL;
1042                }
1043                if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1044                        pr_warn("siw: received rdmap version unsupported %d\n",
1045                                __rdmap_get_version(c_hdr));
1046
1047                        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1048                                           RDMAP_ETYPE_REMOTE_OPERATION,
1049                                           RDMAP_ECODE_VERSION, 0);
1050                        return -EINVAL;
1051                }
1052                opcode = __rdmap_get_opcode(c_hdr);
1053
1054                if (opcode > RDMAP_TERMINATE) {
1055                        pr_warn("siw: received unknown packet type %u\n",
1056                                opcode);
1057
1058                        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1059                                           RDMAP_ETYPE_REMOTE_OPERATION,
1060                                           RDMAP_ECODE_OPCODE, 0);
1061                        return -EINVAL;
1062                }
1063                siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1064        } else {
1065                opcode = __rdmap_get_opcode(c_hdr);
1066        }
1067        set_rx_fpdu_context(qp, opcode);
1068        frx = qp->rx_fpdu;
1069
1070        /*
 1071         * Figure out the length of the current header: the variable
 1072         * iWARP header length may force us to copy header information
 1073         * in two steps. Only the minimum-sized (tagged) DDP header is
 1074         * already completely received at this point.
1075         */
1076        if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1077                bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
1078
1079                if (srx->skb_new < bytes)
1080                        return -EAGAIN;
1081
1082                skb_copy_bits(skb, srx->skb_offset,
1083                              (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1084
1085                srx->fpdu_part_rcvd += bytes;
1086
1087                srx->skb_new -= bytes;
1088                srx->skb_offset += bytes;
1089                srx->skb_copied += bytes;
1090        }
1091
1092        /*
1093         * DDP/RDMAP header receive completed. Check if the current
1094         * DDP segment starts a new RDMAP message or continues a previously
1095         * started RDMAP message.
1096         *
1097         * Alternating reception of DDP segments (or FPDUs) from incomplete
1098         * tagged and untagged RDMAP messages is supported, as long as
1099         * the current tagged or untagged message gets eventually completed
1100         * w/o intersection from another message of the same type
1101         * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1102         * but not by a READ RESPONSE etc.
1103         */
1104        if (srx->mpa_crc_hd) {
1105                /*
1106                 * Restart CRC computation
1107                 */
1108                crypto_shash_init(srx->mpa_crc_hd);
1109                crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1110                                    srx->fpdu_part_rcvd);
1111        }
1112        if (frx->more_ddp_segs) {
1113                frx->first_ddp_seg = 0;
1114                if (frx->prev_rdmap_op != opcode) {
1115                        pr_warn("siw: packet intersection: %u : %u\n",
1116                                frx->prev_rdmap_op, opcode);
1117                        /*
1118                         * The last inbound RDMA operation of same type
1119                         * (tagged or untagged) is left unfinished.
1120                         * To complete it in error, make it the current
1121                         * operation again, even with the header already
1122                         * overwritten. For error handling, only the opcode
1123                         * and current rx context are relevant.
1124                         */
1125                        set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1126                        __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1127                        return -EPROTO;
1128                }
1129        } else {
1130                frx->prev_rdmap_op = opcode;
1131                frx->first_ddp_seg = 1;
1132        }
1133        frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1134
1135        return 0;
1136}
1137
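     /*
      * siw_check_tx_fence()
      *
      * Invalidate the ORQ entry of the just completed inbound READ
      * RESPONSE and, if SQ processing was fenced, resume it once the
      * ORQ state permits.
      */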
1138static int siw_check_tx_fence(struct siw_qp *qp)
1139{
1140        struct siw_wqe *tx_waiting = tx_wqe(qp);
1141        struct siw_sqe *rreq;
1142        int resume_tx = 0, rv = 0;
1143        unsigned long flags;
1144
1145        spin_lock_irqsave(&qp->orq_lock, flags);
1146
1147        rreq = orq_get_current(qp);
1148
1149        /* free current orq entry */
1150        WRITE_ONCE(rreq->flags, 0);
1151
1152        if (qp->tx_ctx.orq_fence) {
1153                if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1154                        pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1155                                qp_id(qp), tx_waiting->wr_status);
1156                        rv = -EPROTO;
1157                        goto out;
1158                }
1159                /* resume SQ processing */
1160                if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1161                    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1162                        rreq = orq_get_tail(qp);
1163                        if (unlikely(!rreq)) {
1164                                pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1165                                rv = -EPROTO;
1166                                goto out;
1167                        }
1168                        siw_read_to_orq(rreq, &tx_waiting->sqe);
1169
1170                        qp->orq_put++;
1171                        qp->tx_ctx.orq_fence = 0;
1172                        resume_tx = 1;
1173
1174                } else if (siw_orq_empty(qp)) {
1175                        qp->tx_ctx.orq_fence = 0;
1176                        resume_tx = 1;
1177                } else {
1178                        pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
1179                                qp_id(qp), qp->orq_get, qp->orq_put);
1180                        rv = -EPROTO;
1181                }
1182        }
1183        qp->orq_get++;
1184out:
1185        spin_unlock_irqrestore(&qp->orq_lock, flags);
1186
1187        if (resume_tx)
1188                rv = siw_sq_start(qp);
1189
1190        return rv;
1191}
1192
1193/*
1194 * siw_rdmap_complete()
1195 *
1196 * Complete processing of an RDMA message after receiving all
 1197 * DDP segments, or abort processing after encountering an error case.
 1198 *
 1199 *   o SENDs + RRESPs need completion,
 1200 *   o RREQs need READ RESPONSE initialization,
1201 *   o WRITEs need memory dereferencing
1202 *
1203 * TODO: Failed WRITEs need local error to be surfaced.
1204 */
1205static int siw_rdmap_complete(struct siw_qp *qp, int error)
1206{
1207        struct siw_rx_stream *srx = &qp->rx_stream;
1208        struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1209        enum siw_wc_status wc_status = wqe->wc_status;
1210        u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1211        int rv = 0;
1212
1213        switch (opcode) {
1214        case RDMAP_SEND_SE:
1215        case RDMAP_SEND_SE_INVAL:
1216                wqe->rqe.flags |= SIW_WQE_SOLICITED;
1217                /* Fall through */
1218
1219        case RDMAP_SEND:
1220        case RDMAP_SEND_INVAL:
1221                if (wqe->wr_status == SIW_WR_IDLE)
1222                        break;
1223
1224                srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1225
1226                if (error != 0 && wc_status == SIW_WC_SUCCESS)
1227                        wc_status = SIW_WC_GENERAL_ERR;
1228                /*
1229                 * Handle STag invalidation request
1230                 */
1231                if (wc_status == SIW_WC_SUCCESS &&
1232                    (opcode == RDMAP_SEND_INVAL ||
1233                     opcode == RDMAP_SEND_SE_INVAL)) {
1234                        rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1235                        if (rv) {
1236                                siw_init_terminate(
1237                                        qp, TERM_ERROR_LAYER_RDMAP,
1238                                        rv == -EACCES ?
1239                                                RDMAP_ETYPE_REMOTE_PROTECTION :
1240                                                RDMAP_ETYPE_REMOTE_OPERATION,
1241                                        RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1242
1243                                wc_status = SIW_WC_REM_INV_REQ_ERR;
1244                        }
1245                        rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1246                                              rv ? 0 : srx->inval_stag,
1247                                              wc_status);
1248                } else {
1249                        rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1250                                              0, wc_status);
1251                }
1252                siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1253                break;
1254
1255        case RDMAP_RDMA_READ_RESP:
1256                if (wqe->wr_status == SIW_WR_IDLE)
1257                        break;
1258
1259                if (error != 0) {
1260                        if ((srx->state == SIW_GET_HDR &&
1261                             qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1262                                /* possible RREQ in ORQ left untouched */
1263                                break;
1264
1265                        if (wc_status == SIW_WC_SUCCESS)
1266                                wc_status = SIW_WC_GENERAL_ERR;
1267                } else if (qp->kernel_verbs &&
1268                           rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1269                        /*
1270                         * Handle any STag invalidation request
1271                         */
1272                        rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1273                        if (rv) {
1274                                siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1275                                                   RDMAP_ETYPE_CATASTROPHIC,
1276                                                   RDMAP_ECODE_UNSPECIFIED, 0);
1277
1278                                if (wc_status == SIW_WC_SUCCESS) {
1279                                        wc_status = SIW_WC_GENERAL_ERR;
1280                                        error = rv;
1281                                }
1282                        }
1283                }
1284                /*
1285                 * All errors turn the wqe into signalled.
1286                 */
1287                if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1288                        rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1289                                              wc_status);
1290                siw_wqe_put_mem(wqe, SIW_OP_READ);
1291
1292                if (!error)
1293                        rv = siw_check_tx_fence(qp);
1294                else
 1295                        /* Disable current ORQ element */
1296                        WRITE_ONCE(orq_get_current(qp)->flags, 0);
1297                break;
1298
1299        case RDMAP_RDMA_READ_REQ:
1300                if (!error) {
1301                        rv = siw_init_rresp(qp, srx);
1302                        srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1303                }
1304                break;
1305
1306        case RDMAP_RDMA_WRITE:
1307                if (wqe->wr_status == SIW_WR_IDLE)
1308                        break;
1309
1310                /*
1311                 * Free References from memory object if
1312                 * attached to receive context (inbound WRITE).
1313                 * While a zero-length WRITE is allowed,
1314                 * no memory reference got created.
1315                 */
1316                if (rx_mem(&qp->rx_tagged)) {
1317                        siw_mem_put(rx_mem(&qp->rx_tagged));
1318                        rx_mem(&qp->rx_tagged) = NULL;
1319                }
1320                break;
1321
1322        default:
1323                break;
1324        }
1325        wqe->wr_status = SIW_WR_IDLE;
1326
1327        return rv;
1328}
1329
1330/*
1331 * siw_tcp_rx_data()
1332 *
1333 * Main routine to consume inbound TCP payload
1334 *
1335 * @rd_desc:    read descriptor
1336 * @skb:        socket buffer
1337 * @off:        offset in skb
1338 * @len:        skb->len - offset : payload in skb
1339 */
1340int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1341                    unsigned int off, size_t len)
1342{
1343        struct siw_qp *qp = rd_desc->arg.data;
1344        struct siw_rx_stream *srx = &qp->rx_stream;
1345        int rv;
1346
1347        srx->skb = skb;
1348        srx->skb_new = skb->len - off;
1349        srx->skb_offset = off;
1350        srx->skb_copied = 0;
1351
1352        siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1353
1354        while (srx->skb_new) {
1355                int run_completion = 1;
1356
1357                if (unlikely(srx->rx_suspend)) {
1358                        /* Do not process any more data */
1359                        srx->skb_copied += srx->skb_new;
1360                        break;
1361                }
1362                switch (srx->state) {
1363                case SIW_GET_HDR:
1364                        rv = siw_get_hdr(srx);
1365                        if (!rv) {
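                                         /*
                                          * mpa_len counts the DDP segment
                                          * but not the 2 byte MPA header,
                                          * pad or CRC, so the payload still
                                          * to come is mpa_len + MPA_HDR_SIZE
                                          * minus the header bytes consumed.
                                          */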
1366                                srx->fpdu_part_rem =
1367                                        be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1368                                        srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1369
1370                                if (srx->fpdu_part_rem)
1371                                        srx->pad = -srx->fpdu_part_rem & 0x3;
1372                                else
1373                                        srx->pad = 0;
1374
1375                                srx->state = SIW_GET_DATA_START;
1376                                srx->fpdu_part_rcvd = 0;
1377                        }
1378                        break;
1379
1380                case SIW_GET_DATA_MORE:
1381                        /*
1382                         * Another data fragment of the same DDP segment.
1383                         * Setting first_ddp_seg = 0 avoids repeating
1384                         * initializations that shall occur only once per
1385                         * DDP segment.
1386                         */
1387                        qp->rx_fpdu->first_ddp_seg = 0;
1388                        /* Fall through */
1389
1390                case SIW_GET_DATA_START:
1391                        /*
1392                         * Headers will be checked by the opcode-specific
1393                         * data receive function below.
1394                         */
1395                        rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1396                        if (!rv) {
1397                                int mpa_len =
1398                                        be16_to_cpu(srx->hdr.ctrl.mpa_len)
1399                                        + MPA_HDR_SIZE;
1400
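                                         /*
                                          * All DDP payload of this FPDU is
                                          * received. What remains is the
                                          * trailer: 0..3 MPA pad bytes plus
                                          * the 4 byte CRC.
                                          */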
1401                                srx->fpdu_part_rem = (-mpa_len & 0x3)
1402                                                      + MPA_CRC_SIZE;
1403                                srx->fpdu_part_rcvd = 0;
1404                                srx->state = SIW_GET_TRAILER;
1405                        } else {
1406                                if (unlikely(rv == -ECONNRESET))
1407                                        run_completion = 0;
1408                                else
1409                                        srx->state = SIW_GET_DATA_MORE;
1410                        }
1411                        break;
1412
1413                case SIW_GET_TRAILER:
1414                        /*
1415                         * read CRC + any padding
1416                         */
1417                        rv = siw_get_trailer(qp, srx);
1418                        if (likely(!rv)) {
1419                                /*
1420                                 * FPDU completed.
1421                                 * complete RDMAP message if last fragment
1422                                 */
1423                                srx->state = SIW_GET_HDR;
1424                                srx->fpdu_part_rcvd = 0;
1425
1426                                if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1427                                      DDP_FLAG_LAST))
1428                                        /* more frags */
1429                                        break;
1430
1431                                rv = siw_rdmap_complete(qp, 0);
1432                                run_completion = 0;
1433                        }
1434                        break;
1435
1436                default:
1437                        pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1438                        rv = -EPROTO;
1439                        run_completion = 0;
1440                }
1441                if (unlikely(rv != 0 && rv != -EAGAIN)) {
1442                        if ((srx->state > SIW_GET_HDR ||
1443                             qp->rx_fpdu->more_ddp_segs) && run_completion)
1444                                siw_rdmap_complete(qp, rv);
1445
1446                        siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1447                                   srx->state);
1448
1449                        siw_qp_cm_drop(qp, 1);
1450
1451                        break;
1452                }
1453                if (rv) {
1454                        siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1455                                   srx->state, srx->fpdu_part_rem);
1456                        break;
1457                }
1458        }
1459        return srx->skb_copied;
1460}
1461