linux/drivers/infiniband/sw/siw/siw_qp_rx.c
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2
   3/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
   4/* Copyright (c) 2008-2019, IBM Corporation */
   5
   6#include <linux/errno.h>
   7#include <linux/types.h>
   8#include <linux/net.h>
   9#include <linux/scatterlist.h>
  10#include <linux/highmem.h>
  11
  12#include <rdma/iw_cm.h>
  13#include <rdma/ib_verbs.h>
  14
  15#include "siw.h"
  16#include "siw_verbs.h"
  17#include "siw_mem.h"
  18
  19/*
  20 * siw_rx_umem()
  21 *
  22 * Receive data of @len into target referenced by @dest_addr.
  23 *
  24 * @srx:        Receive Context
  25 * @umem:       siw representation of target memory
  26 * @dest_addr:  user virtual address
  27 * @len:        number of bytes to place
  28 */
  29static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
  30                       u64 dest_addr, int len)
  31{
  32        int copied = 0;
  33
  34        while (len) {
  35                struct page *p;
  36                int pg_off, bytes, rv;
  37                void *dest;
  38
  39                p = siw_get_upage(umem, dest_addr);
  40                if (unlikely(!p)) {
  41                        pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
  42                                __func__, qp_id(rx_qp(srx)),
  43                                (void *)(uintptr_t)dest_addr,
  44                                (void *)(uintptr_t)umem->fp_addr);
  45                        /* siw internal error */
  46                        srx->skb_copied += copied;
  47                        srx->skb_new -= copied;
  48
  49                        return -EFAULT;
  50                }
  51                pg_off = dest_addr & ~PAGE_MASK;
  52                bytes = min(len, (int)PAGE_SIZE - pg_off);
  53
  54                siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);
  55
  56                dest = kmap_atomic(p);
  57                rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
  58                                   bytes);
  59
  60                if (unlikely(rv)) {
  61                        kunmap_atomic(dest);
  62                        srx->skb_copied += copied;
  63                        srx->skb_new -= copied;
  64
  65                        pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
  66                                qp_id(rx_qp(srx)), __func__, len, p, rv);
  67
  68                        return -EFAULT;
  69                }
  70                if (srx->mpa_crc_hd) {
  71                        if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
  72                                crypto_shash_update(srx->mpa_crc_hd,
  73                                        (u8 *)(dest + pg_off), bytes);
  74                                kunmap_atomic(dest);
  75                        } else {
  76                                kunmap_atomic(dest);
  77                                /*
  78                                 * Do CRC on original, not target buffer.
  79                                 * Some user land applications may
  80                                 * concurrently write the target buffer,
  81                                 * which would yield a broken CRC.
  82                                 * Walking the skb twice is very inefficient.
  83                                 * Folding the CRC into skb_copy_bits()
  84                                 * would be much better, but is currently
  85                                 * not supported.
  86                                 */
  87                                siw_crc_skb(srx, bytes);
  88                        }
  89                } else {
  90                        kunmap_atomic(dest);
  91                }
  92                srx->skb_offset += bytes;
  93                copied += bytes;
  94                len -= bytes;
  95                dest_addr += bytes;
  96                pg_off = 0;
  97        }
  98        srx->skb_copied += copied;
  99        srx->skb_new -= copied;
 100
 101        return copied;
 102}
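
/*
 * Worked example for the page-split logic above (an illustration only,
 * assuming a 4 KiB PAGE_SIZE; the address is made up): for
 * dest_addr == 0x10ff8 and len == 16,
 *
 *   pass 1: pg_off = 0x10ff8 & 0xfff = 4088, bytes = min(16, 4096 - 4088) = 8
 *   pass 2: dest_addr == 0x11000, pg_off = 0, bytes = min(8, 4096) = 8
 *
 * i.e. the copy is split transparently at the page boundary, and the CRC,
 * if enabled, is fed exactly the bytes that were placed.
 */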
 103
 104static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
 105{
 106        int rv;
 107
 108        siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);
 109
 110        rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
 111        if (unlikely(rv)) {
 112                pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
 113                        qp_id(rx_qp(srx)), __func__, len, kva, rv);
 114
 115                return rv;
 116        }
 117        if (srx->mpa_crc_hd)
 118                crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
 119
 120        srx->skb_offset += len;
 121        srx->skb_copied += len;
 122        srx->skb_new -= len;
 123
 124        return len;
 125}
 126
 127static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
 128                      struct siw_mem *mem, u64 addr, int len)
 129{
 130        struct siw_pbl *pbl = mem->pbl;
 131        u64 offset = addr - mem->va;
 132        int copied = 0;
 133
 134        while (len) {
 135                int bytes;
 136                dma_addr_t buf_addr =
 137                        siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
 138                if (!buf_addr)
 139                        break;
 140
 141                bytes = min(bytes, len);
 142                if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) ==
 143                    bytes) {
 144                        copied += bytes;
 145                        offset += bytes;
 146                        len -= bytes;
 147                } else {
 148                        break;
 149                }
 150        }
 151        return copied;
 152}
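
/*
 * Note on the PBL path (descriptive sketch): siw_pbl_get_buffer() is
 * assumed here to return the address of the physically contiguous chunk
 * backing @offset within the MR and, via @bytes, how much of that chunk
 * remains from @offset on. Each chunk is streamed through siw_rx_kva(),
 * so a single SGE may span several PBL entries; a short copy ends the
 * walk and the caller observes copied < len.
 */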
 153
 154/*
 155 * siw_rresp_check_ntoh()
 156 *
 157 * Check incoming RRESP fragment header against expected
 158 * header values and update expected values for potential next
 159 * fragment.
 160 *
 161 * NOTE: This function must be called only if a RRESP DDP segment
 162 *       starts but not for fragmented consecutive pieces of an
 163 *       already started DDP segment.
 164 */
 165static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
 166                                struct siw_rx_fpdu *frx)
 167{
 168        struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
 169        struct siw_wqe *wqe = &frx->wqe_active;
 170        enum ddp_ecode ecode;
 171
 172        u32 sink_stag = be32_to_cpu(rresp->sink_stag);
 173        u64 sink_to = be64_to_cpu(rresp->sink_to);
 174
 175        if (frx->first_ddp_seg) {
 176                srx->ddp_stag = wqe->sqe.sge[0].lkey;
 177                srx->ddp_to = wqe->sqe.sge[0].laddr;
 178                frx->pbl_idx = 0;
 179        }
 180        /* Below checks extend beyond the semantics of DDP, and
 181         * into RDMAP:
 182         * We check if the read response matches exactly the
 183         * read request which was sent to the remote peer to
 184         * trigger this read response. RFC5040/5041 do not
 185         * always have a proper error code for the detected
 186         * error cases. We choose 'base or bounds error' for
 187         * cases where the inbound STag is valid, but offset
 188         * or length do not match our response receive state.
 189         */
 190        if (unlikely(srx->ddp_stag != sink_stag)) {
 191                pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
 192                        qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
 193                ecode = DDP_ECODE_T_INVALID_STAG;
 194                goto error;
 195        }
 196        if (unlikely(srx->ddp_to != sink_to)) {
 197                pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
 198                        qp_id(rx_qp(srx)), (unsigned long long)sink_to,
 199                        (unsigned long long)srx->ddp_to);
 200                ecode = DDP_ECODE_T_BASE_BOUNDS;
 201                goto error;
 202        }
 203        if (unlikely(!frx->more_ddp_segs &&
 204                     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
 205                pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
 206                        qp_id(rx_qp(srx)),
 207                        wqe->processed + srx->fpdu_part_rem, wqe->bytes);
 208                ecode = DDP_ECODE_T_BASE_BOUNDS;
 209                goto error;
 210        }
 211        return 0;
 212error:
 213        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
 214                           DDP_ETYPE_TAGGED_BUF, ecode, 0);
 215        return -EINVAL;
 216}
 217
 218/*
 219 * siw_write_check_ntoh()
 220 *
 221 * Check incoming WRITE fragment header against expected
 222 * header values and update expected values for potential next
 223 * fragment
 224 *
 225 * NOTE: This function must be called only if a WRITE DDP segment
 226 *       starts but not for fragmented consecutive pieces of an
 227 *       already started DDP segment.
 228 */
 229static int siw_write_check_ntoh(struct siw_rx_stream *srx,
 230                                struct siw_rx_fpdu *frx)
 231{
 232        struct iwarp_rdma_write *write = &srx->hdr.rwrite;
 233        enum ddp_ecode ecode;
 234
 235        u32 sink_stag = be32_to_cpu(write->sink_stag);
 236        u64 sink_to = be64_to_cpu(write->sink_to);
 237
 238        if (frx->first_ddp_seg) {
 239                srx->ddp_stag = sink_stag;
 240                srx->ddp_to = sink_to;
 241                frx->pbl_idx = 0;
 242        } else {
 243                if (unlikely(srx->ddp_stag != sink_stag)) {
 244                        pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
 245                                qp_id(rx_qp(srx)), sink_stag,
 246                                srx->ddp_stag);
 247                        ecode = DDP_ECODE_T_INVALID_STAG;
 248                        goto error;
 249                }
 250                if (unlikely(srx->ddp_to != sink_to)) {
 251                        pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
 252                                qp_id(rx_qp(srx)),
 253                                (unsigned long long)sink_to,
 254                                (unsigned long long)srx->ddp_to);
 255                        ecode = DDP_ECODE_T_BASE_BOUNDS;
 256                        goto error;
 257                }
 258        }
 259        return 0;
 260error:
 261        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
 262                           DDP_ETYPE_TAGGED_BUF, ecode, 0);
 263        return -EINVAL;
 264}
 265
 266/*
 267 * siw_send_check_ntoh()
 268 *
 269 * Check incoming SEND fragment header against expected
 270 * header values and update expected MSN if no next
 271 * fragment expected
 272 *
 273 * NOTE: This function must be called only if a SEND DDP segment
 274 *       starts but not for fragmented consecutive pieces of an
 275 *       already started DDP segment.
 276 */
 277static int siw_send_check_ntoh(struct siw_rx_stream *srx,
 278                               struct siw_rx_fpdu *frx)
 279{
 280        struct iwarp_send_inv *send = &srx->hdr.send_inv;
 281        struct siw_wqe *wqe = &frx->wqe_active;
 282        enum ddp_ecode ecode;
 283
 284        u32 ddp_msn = be32_to_cpu(send->ddp_msn);
 285        u32 ddp_mo = be32_to_cpu(send->ddp_mo);
 286        u32 ddp_qn = be32_to_cpu(send->ddp_qn);
 287
 288        if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
 289                pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
 290                        qp_id(rx_qp(srx)), ddp_qn);
 291                ecode = DDP_ECODE_UT_INVALID_QN;
 292                goto error;
 293        }
 294        if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
 295                pr_warn("siw: [QP %u]: send msn: %u != %u\n",
 296                        qp_id(rx_qp(srx)), ddp_msn,
 297                        srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
 298                ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
 299                goto error;
 300        }
 301        if (unlikely(ddp_mo != wqe->processed)) {
 302                pr_warn("siw: [QP %u], send mo: %u != %u\n",
 303                        qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
 304                ecode = DDP_ECODE_UT_INVALID_MO;
 305                goto error;
 306        }
 307        if (frx->first_ddp_seg) {
 308                /* initialize user memory write position */
 309                frx->sge_idx = 0;
 310                frx->sge_off = 0;
 311                frx->pbl_idx = 0;
 312
 313                /* only valid for SEND_INV and SEND_SE_INV operations */
 314                srx->inval_stag = be32_to_cpu(send->inval_stag);
 315        }
 316        if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
 317                siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
 318                           wqe->bytes, wqe->processed, srx->fpdu_part_rem);
 319                wqe->wc_status = SIW_WC_LOC_LEN_ERR;
 320                ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
 321                goto error;
 322        }
 323        return 0;
 324error:
 325        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
 326                           DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
 327        return -EINVAL;
 328}
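
/*
 * Illustration of the untagged DDP checks above (numbers invented for
 * the example): a 3000 byte SEND arriving as three DDP segments of
 * 1000 bytes each carries the same MSN in all three segments, while the
 * MO fields are 0, 1000 and 2000. Since wqe->processed accumulates the
 * bytes already placed for this message, the ddp_mo == wqe->processed
 * test catches lost or reordered segments; the MSN only advances once
 * the complete message is processed (see siw_rdmap_complete()).
 */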
 329
 330static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
 331{
 332        struct siw_rqe *rqe;
 333        struct siw_srq *srq;
 334        struct siw_wqe *wqe = NULL;
 335        bool srq_event = false;
 336        unsigned long flags;
 337
 338        srq = qp->srq;
 339        if (srq) {
 340                spin_lock_irqsave(&srq->lock, flags);
 341                if (unlikely(!srq->num_rqe))
 342                        goto out;
 343
 344                rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
 345        } else {
 346                if (unlikely(!qp->recvq))
 347                        goto out;
 348
 349                rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
 350        }
 351        if (likely(rqe->flags == SIW_WQE_VALID)) {
 352                int num_sge = rqe->num_sge;
 353
 354                if (likely(num_sge <= SIW_MAX_SGE)) {
 355                        int i = 0;
 356
 357                        wqe = rx_wqe(&qp->rx_untagged);
 358                        rx_type(wqe) = SIW_OP_RECEIVE;
 359                        wqe->wr_status = SIW_WR_INPROGRESS;
 360                        wqe->bytes = 0;
 361                        wqe->processed = 0;
 362
 363                        wqe->rqe.id = rqe->id;
 364                        wqe->rqe.num_sge = num_sge;
 365
 366                        while (i < num_sge) {
 367                                wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
 368                                wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
 369                                wqe->rqe.sge[i].length = rqe->sge[i].length;
 370                                wqe->bytes += wqe->rqe.sge[i].length;
 371                                wqe->mem[i] = NULL;
 372                                i++;
 373                        }
 374                        /* can be re-used by appl */
 375                        smp_store_mb(rqe->flags, 0);
 376                } else {
 377                        siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
 378                        if (srq)
 379                                spin_unlock_irqrestore(&srq->lock, flags);
 380                        return NULL;
 381                }
 382                if (!srq) {
 383                        qp->rq_get++;
 384                } else {
 385                        if (srq->armed) {
 386                                /* Test SRQ limit */
 387                                u32 off = (srq->rq_get + srq->limit) %
 388                                          srq->num_rqe;
 389                                struct siw_rqe *rqe2 = &srq->recvq[off];
 390
 391                                if (!(rqe2->flags & SIW_WQE_VALID)) {
 392                                        srq->armed = false;
 393                                        srq_event = true;
 394                                }
 395                        }
 396                        srq->rq_get++;
 397                }
 398        }
 399out:
 400        if (srq) {
 401                spin_unlock_irqrestore(&srq->lock, flags);
 402                if (srq_event)
 403                        siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
 404        }
 405        return wqe;
 406}
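
/*
 * SRQ limit illustration (example values only): with num_rqe == 8 and
 * limit == 3, consuming the RQE at rq_get == 5 makes the code probe
 * slot (5 + 3) % 8 == 0. If that slot is not marked SIW_WQE_VALID,
 * fewer than 'limit' receives appear to remain posted, so srq->armed is
 * cleared and IB_EVENT_SRQ_LIMIT_REACHED is reported exactly once.
 */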
 407
 408/*
 409 * siw_proc_send:
 410 *
 411 * Process one incoming SEND and place data into memory referenced by
 412 * receive wqe.
 413 *
 414 * Function supports partially received sends (suspending/resuming
 415 * current receive wqe processing)
 416 *
 417 * return value:
 418 *      0:       reached the end of a DDP segment
 419 *      -EAGAIN: to be called again to finish the DDP segment
 420 */
 421int siw_proc_send(struct siw_qp *qp)
 422{
 423        struct siw_rx_stream *srx = &qp->rx_stream;
 424        struct siw_rx_fpdu *frx = &qp->rx_untagged;
 425        struct siw_wqe *wqe;
 426        u32 data_bytes; /* all data bytes available */
 427        u32 rcvd_bytes; /* sum of data bytes rcvd */
 428        int rv = 0;
 429
 430        if (frx->first_ddp_seg) {
 431                wqe = siw_rqe_get(qp);
 432                if (unlikely(!wqe)) {
 433                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 434                                           DDP_ETYPE_UNTAGGED_BUF,
 435                                           DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
 436                        return -ENOENT;
 437                }
 438        } else {
 439                wqe = rx_wqe(frx);
 440        }
 441        if (srx->state == SIW_GET_DATA_START) {
 442                rv = siw_send_check_ntoh(srx, frx);
 443                if (unlikely(rv)) {
 444                        siw_qp_event(qp, IB_EVENT_QP_FATAL);
 445                        return rv;
 446                }
 447                if (!srx->fpdu_part_rem) /* zero length SEND */
 448                        return 0;
 449        }
 450        data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
 451        rcvd_bytes = 0;
 452
 453        /* A zero length SEND will skip the loop below */
 454        while (data_bytes) {
 455                struct ib_pd *pd;
 456                struct siw_mem **mem, *mem_p;
 457                struct siw_sge *sge;
 458                u32 sge_bytes; /* data bytes avail for SGE */
 459
 460                sge = &wqe->rqe.sge[frx->sge_idx];
 461
 462                if (!sge->length) {
 463                        /* just skip empty sge's */
 464                        frx->sge_idx++;
 465                        frx->sge_off = 0;
 466                        frx->pbl_idx = 0;
 467                        continue;
 468                }
 469                sge_bytes = min(data_bytes, sge->length - frx->sge_off);
 470                mem = &wqe->mem[frx->sge_idx];
 471
 472                /*
 473                 * check with QP's PD if no SRQ present, SRQ's PD otherwise
 474                 */
 475                pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
 476
 477                rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
 478                                   frx->sge_off, sge_bytes);
 479                if (unlikely(rv)) {
 480                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 481                                           DDP_ETYPE_CATASTROPHIC,
 482                                           DDP_ECODE_CATASTROPHIC, 0);
 483
 484                        siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
 485                        break;
 486                }
 487                mem_p = *mem;
 488                if (mem_p->mem_obj == NULL)
 489                        rv = siw_rx_kva(srx,
 490                                (void *)(uintptr_t)(sge->laddr + frx->sge_off),
 491                                sge_bytes);
 492                else if (!mem_p->is_pbl)
 493                        rv = siw_rx_umem(srx, mem_p->umem,
 494                                         sge->laddr + frx->sge_off, sge_bytes);
 495                else
 496                        rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
 497                                        sge->laddr + frx->sge_off, sge_bytes);
 498
 499                if (unlikely(rv != sge_bytes)) {
 500                        wqe->processed += rcvd_bytes;
 501
 502                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 503                                           DDP_ETYPE_CATASTROPHIC,
 504                                           DDP_ECODE_CATASTROPHIC, 0);
 505                        return -EINVAL;
 506                }
 507                frx->sge_off += rv;
 508
 509                if (frx->sge_off == sge->length) {
 510                        frx->sge_idx++;
 511                        frx->sge_off = 0;
 512                        frx->pbl_idx = 0;
 513                }
 514                data_bytes -= rv;
 515                rcvd_bytes += rv;
 516
 517                srx->fpdu_part_rem -= rv;
 518                srx->fpdu_part_rcvd += rv;
 519        }
 520        wqe->processed += rcvd_bytes;
 521
 522        if (!srx->fpdu_part_rem)
 523                return 0;
 524
 525        return (rv < 0) ? rv : -EAGAIN;
 526}
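
/*
 * Resume behaviour (descriptive): returning 0 tells siw_tcp_rx_data()
 * that the payload of the current DDP segment is complete and the
 * stream advances to SIW_GET_TRAILER; -EAGAIN leaves the stream in
 * SIW_GET_DATA_MORE with sge_idx/sge_off/pbl_idx kept in the
 * rx_untagged context, so the same receive WQE is resumed when more
 * TCP payload arrives.
 */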
 527
 528/*
 529 * siw_proc_write:
 530 *
 531 * Place incoming WRITE after referencing and checking target buffer
 532 *
 533 * Function supports partially received WRITEs (suspending/resuming
 534 * current receive processing)
 535 *
 536 * return value:
 537 *      0:       reached the end of a DDP segment
 538 *      -EAGAIN: to be called again to finish the DDP segment
 539 */
 540int siw_proc_write(struct siw_qp *qp)
 541{
 542        struct siw_rx_stream *srx = &qp->rx_stream;
 543        struct siw_rx_fpdu *frx = &qp->rx_tagged;
 544        struct siw_mem *mem;
 545        int bytes, rv;
 546
 547        if (srx->state == SIW_GET_DATA_START) {
 548                if (!srx->fpdu_part_rem) /* zero length WRITE */
 549                        return 0;
 550
 551                rv = siw_write_check_ntoh(srx, frx);
 552                if (unlikely(rv)) {
 553                        siw_qp_event(qp, IB_EVENT_QP_FATAL);
 554                        return rv;
 555                }
 556        }
 557        bytes = min(srx->fpdu_part_rem, srx->skb_new);
 558
 559        if (frx->first_ddp_seg) {
 560                struct siw_wqe *wqe = rx_wqe(frx);
 561
 562                rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
 563                if (unlikely(!rx_mem(frx))) {
 564                        siw_dbg_qp(qp,
 565                                   "sink stag not found/invalid, stag 0x%08x\n",
 566                                   srx->ddp_stag);
 567
 568                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 569                                           DDP_ETYPE_TAGGED_BUF,
 570                                           DDP_ECODE_T_INVALID_STAG, 0);
 571                        return -EINVAL;
 572                }
 573                wqe->rqe.num_sge = 1;
 574                rx_type(wqe) = SIW_OP_WRITE;
 575                wqe->wr_status = SIW_WR_INPROGRESS;
 576        }
 577        mem = rx_mem(frx);
 578
 579        /*
 580         * Check if application re-registered memory with different
 581         * key field of STag.
 582         */
 583        if (unlikely(mem->stag != srx->ddp_stag)) {
 584                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 585                                   DDP_ETYPE_TAGGED_BUF,
 586                                   DDP_ECODE_T_INVALID_STAG, 0);
 587                return -EINVAL;
 588        }
 589        rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
 590                           IB_ACCESS_REMOTE_WRITE, bytes);
 591        if (unlikely(rv)) {
 592                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 593                                   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
 594                                   0);
 595
 596                siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
 597
 598                return -EINVAL;
 599        }
 600
 601        if (mem->mem_obj == NULL)
 602                rv = siw_rx_kva(srx,
 603                        (void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
 604                        bytes);
 605        else if (!mem->is_pbl)
 606                rv = siw_rx_umem(srx, mem->umem,
 607                                 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
 608        else
 609                rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
 610                                srx->ddp_to + srx->fpdu_part_rcvd, bytes);
 611
 612        if (unlikely(rv != bytes)) {
 613                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 614                                   DDP_ETYPE_CATASTROPHIC,
 615                                   DDP_ECODE_CATASTROPHIC, 0);
 616                return -EINVAL;
 617        }
 618        srx->fpdu_part_rem -= rv;
 619        srx->fpdu_part_rcvd += rv;
 620
 621        if (!srx->fpdu_part_rem) {
 622                srx->ddp_to += srx->fpdu_part_rcvd;
 623                return 0;
 624        }
 625        return -EAGAIN;
 626}
 627
 628/*
 629 * Inbound RREQ's cannot carry user data.
 630 */
 631int siw_proc_rreq(struct siw_qp *qp)
 632{
 633        struct siw_rx_stream *srx = &qp->rx_stream;
 634
 635        if (!srx->fpdu_part_rem)
 636                return 0;
 637
 638        pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
 639                be16_to_cpu(srx->hdr.ctrl.mpa_len));
 640
 641        return -EPROTO;
 642}
 643
 644/*
 645 * siw_init_rresp:
 646 *
 647 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 648 * Put it at the tail of the IRQ, if there is another WQE currently in
 649 * transmit processing. If not, make it the current WQE to be processed
 650 * and schedule transmit processing.
 651 *
 652 * Can be called from softirq context and from process
 653 * context (RREAD socket loopback case!)
 654 *
 655 * return value:
 656 *      0:      success,
 657 *              failure code otherwise
 658 */
 659
 660static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
 661{
 662        struct siw_wqe *tx_work = tx_wqe(qp);
 663        struct siw_sqe *resp;
 664
 665        uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
 666                 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
 667        uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
 668                 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
 669                 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
 670                 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
 671
 672        int run_sq = 1, rv = 0;
 673        unsigned long flags;
 674
 675        if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
 676                siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 677                                   DDP_ETYPE_UNTAGGED_BUF,
 678                                   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
 679                return -EPROTO;
 680        }
 681        spin_lock_irqsave(&qp->sq_lock, flags);
 682
 683        if (tx_work->wr_status == SIW_WR_IDLE) {
 684                /*
 685                 * immediately schedule READ response w/o
 686                 * consuming IRQ entry: IRQ must be empty.
 687                 */
 688                tx_work->processed = 0;
 689                tx_work->mem[0] = NULL;
 690                tx_work->wr_status = SIW_WR_QUEUED;
 691                resp = &tx_work->sqe;
 692        } else {
 693                resp = irq_alloc_free(qp);
 694                run_sq = 0;
 695        }
 696        if (likely(resp)) {
 697                resp->opcode = SIW_OP_READ_RESPONSE;
 698
 699                resp->sge[0].length = length;
 700                resp->sge[0].laddr = laddr;
 701                resp->sge[0].lkey = lkey;
 702
 703                /* Keep aside message sequence number for potential
 704                 * error reporting during Read Response generation.
 705                 */
 706                resp->sge[1].length = msn;
 707
 708                resp->raddr = raddr;
 709                resp->rkey = rkey;
 710                resp->num_sge = length ? 1 : 0;
 711
 712                /* RRESP now valid as current TX wqe or placed into IRQ */
 713                smp_store_mb(resp->flags, SIW_WQE_VALID);
 714        } else {
 715                pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
 716                        qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
 717
 718                siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
 719                                   RDMAP_ETYPE_REMOTE_OPERATION,
 720                                   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
 721                rv = -EPROTO;
 722        }
 723
 724        spin_unlock_irqrestore(&qp->sq_lock, flags);
 725
 726        if (run_sq)
 727                rv = siw_sq_start(qp);
 728
 729        return rv;
 730}
 731
 732/*
 733 * Only called at start of Read.Response processing.
 734 * Transfer pending Read from tip of ORQ into current rx wqe,
 735 * but keep ORQ entry valid until Read.Response processing done.
 736 * No Queue locking needed.
 737 */
 738static int siw_orqe_start_rx(struct siw_qp *qp)
 739{
 740        struct siw_sqe *orqe;
 741        struct siw_wqe *wqe = NULL;
 742
 743        /* make sure ORQ indices are current */
 744        smp_mb();
 745
 746        orqe = orq_get_current(qp);
 747        if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
 748                /* RRESP is a TAGGED RDMAP operation */
 749                wqe = rx_wqe(&qp->rx_tagged);
 750                wqe->sqe.id = orqe->id;
 751                wqe->sqe.opcode = orqe->opcode;
 752                wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
 753                wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
 754                wqe->sqe.sge[0].length = orqe->sge[0].length;
 755                wqe->sqe.flags = orqe->flags;
 756                wqe->sqe.num_sge = 1;
 757                wqe->bytes = orqe->sge[0].length;
 758                wqe->processed = 0;
 759                wqe->mem[0] = NULL;
 760                /* make sure WQE is completely written before valid */
 761                smp_wmb();
 762                wqe->wr_status = SIW_WR_INPROGRESS;
 763
 764                return 0;
 765        }
 766        return -EPROTO;
 767}
 768
 769/*
 770 * siw_proc_rresp:
 771 *
 772 * Place incoming RRESP data into memory referenced by RREQ WQE
 773 * which is at the tip of the ORQ
 774 *
 775 * Function supports partially received RRESP's (suspending/resuming
 776 * current receive processing)
 777 */
 778int siw_proc_rresp(struct siw_qp *qp)
 779{
 780        struct siw_rx_stream *srx = &qp->rx_stream;
 781        struct siw_rx_fpdu *frx = &qp->rx_tagged;
 782        struct siw_wqe *wqe = rx_wqe(frx);
 783        struct siw_mem **mem, *mem_p;
 784        struct siw_sge *sge;
 785        int bytes, rv;
 786
 787        if (frx->first_ddp_seg) {
 788                if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
 789                        pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
 790                                qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
 791                        rv = -EPROTO;
 792                        goto error_term;
 793                }
 794                /*
 795                 * fetch pending RREQ from orq
 796                 */
 797                rv = siw_orqe_start_rx(qp);
 798                if (rv) {
 799                        pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
 800                                qp_id(qp), qp->orq_get % qp->attrs.orq_size);
 801                        goto error_term;
 802                }
 803                rv = siw_rresp_check_ntoh(srx, frx);
 804                if (unlikely(rv)) {
 805                        siw_qp_event(qp, IB_EVENT_QP_FATAL);
 806                        return rv;
 807                }
 808        } else {
 809                if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
 810                        pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
 811                                qp_id(qp), wqe->wr_status);
 812                        rv = -EPROTO;
 813                        goto error_term;
 814                }
 815        }
 816        if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
 817                return 0;
 818
 819        sge = wqe->sqe.sge; /* there is only one */
 820        mem = &wqe->mem[0];
 821
 822        if (!(*mem)) {
 823                /*
 824                 * check target memory; the memory object is resolved on the first fragment only
 825                 */
 826                rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
 827                                   wqe->bytes);
 828                if (unlikely(rv)) {
 829                        siw_dbg_qp(qp, "target mem check: %d\n", rv);
 830                        wqe->wc_status = SIW_WC_LOC_PROT_ERR;
 831
 832                        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
 833                                           DDP_ETYPE_TAGGED_BUF,
 834                                           siw_tagged_error(-rv), 0);
 835
 836                        siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
 837
 838                        return -EINVAL;
 839                }
 840        }
 841        mem_p = *mem;
 842
 843        bytes = min(srx->fpdu_part_rem, srx->skb_new);
 844
 845        if (mem_p->mem_obj == NULL)
 846                rv = siw_rx_kva(srx,
 847                        (void *)(uintptr_t)(sge->laddr + wqe->processed),
 848                        bytes);
 849        else if (!mem_p->is_pbl)
 850                rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
 851                                 bytes);
 852        else
 853                rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
 854                                sge->laddr + wqe->processed, bytes);
 855        if (rv != bytes) {
 856                wqe->wc_status = SIW_WC_GENERAL_ERR;
 857                rv = -EINVAL;
 858                goto error_term;
 859        }
 860        srx->fpdu_part_rem -= rv;
 861        srx->fpdu_part_rcvd += rv;
 862        wqe->processed += rv;
 863
 864        if (!srx->fpdu_part_rem) {
 865                srx->ddp_to += srx->fpdu_part_rcvd;
 866                return 0;
 867        }
 868        return -EAGAIN;
 869
 870error_term:
 871        siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
 872                           DDP_ECODE_CATASTROPHIC, 0);
 873        return rv;
 874}
 875
 876int siw_proc_terminate(struct siw_qp *qp)
 877{
 878        struct siw_rx_stream *srx = &qp->rx_stream;
 879        struct sk_buff *skb = srx->skb;
 880        struct iwarp_terminate *term = &srx->hdr.terminate;
 881        union iwarp_hdr term_info;
 882        u8 *infop = (u8 *)&term_info;
 883        enum rdma_opcode op;
 884        u16 to_copy = sizeof(struct iwarp_ctrl);
 885
 886        pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
 887                __rdmap_term_layer(term), __rdmap_term_etype(term),
 888                __rdmap_term_ecode(term));
 889
 890        if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
 891            be32_to_cpu(term->ddp_msn) !=
 892                    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
 893            be32_to_cpu(term->ddp_mo) != 0) {
 894                pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
 895                        be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
 896                        be32_to_cpu(term->ddp_mo));
 897                return -ECONNRESET;
 898        }
 899        /*
 900         * Receive remaining pieces of TERM if indicated
 901         */
 902        if (!term->flag_m)
 903                return -ECONNRESET;
 904
 905        /* Do not take the effort to reassemble a network fragmented
 906         * TERM message
 907         */
 908        if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
 909                return -ECONNRESET;
 910
 911        memset(infop, 0, sizeof(term_info));
 912
 913        skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
 914
 915        op = __rdmap_get_opcode(&term_info.ctrl);
 916        if (op >= RDMAP_TERMINATE)
 917                goto out;
 918
 919        infop += to_copy;
 920        srx->skb_offset += to_copy;
 921        srx->skb_new -= to_copy;
 922        srx->skb_copied += to_copy;
 923        srx->fpdu_part_rcvd += to_copy;
 924        srx->fpdu_part_rem -= to_copy;
 925
 926        to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
 927
 928        /* Again, no network fragmented TERM's */
 929        if (to_copy + MPA_CRC_SIZE > srx->skb_new)
 930                return -ECONNRESET;
 931
 932        skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
 933
 934        if (term->flag_r) {
 935                siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
 936                           op, be16_to_cpu(term_info.ctrl.mpa_len),
 937                           term->flag_m ? "valid" : "invalid");
 938        } else if (term->flag_d) {
 939                siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
 940                           op, be16_to_cpu(term_info.ctrl.mpa_len),
 941                           term->flag_m ? "valid" : "invalid");
 942        }
 943out:
 944        srx->skb_new -= to_copy;
 945        srx->skb_offset += to_copy;
 946        srx->skb_copied += to_copy;
 947        srx->fpdu_part_rcvd += to_copy;
 948        srx->fpdu_part_rem -= to_copy;
 949
 950        return -ECONNRESET;
 951}
 952
 953static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
 954{
 955        struct sk_buff *skb = srx->skb;
 956        u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
 957        __wsum crc_in, crc_own = 0;
 958
 959        siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
 960                   srx->fpdu_part_rem, srx->skb_new, srx->pad);
 961
 962        if (srx->skb_new < srx->fpdu_part_rem)
 963                return -EAGAIN;
 964
 965        skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
 966
 967        if (srx->mpa_crc_hd && srx->pad)
 968                crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
 969
 970        srx->skb_new -= srx->fpdu_part_rem;
 971        srx->skb_offset += srx->fpdu_part_rem;
 972        srx->skb_copied += srx->fpdu_part_rem;
 973
 974        if (!srx->mpa_crc_hd)
 975                return 0;
 976
 977        /*
 978         * CRC32 is computed, transmitted and received directly in NBO,
 979         * so there's never a reason to convert byte order.
 980         */
 981        crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
 982        crc_in = (__force __wsum)srx->trailer.crc;
 983
 984        if (unlikely(crc_in != crc_own)) {
 985                pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
 986                        crc_in, crc_own, qp->rx_stream.rdmap_op);
 987
 988                siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
 989                                   LLP_ETYPE_MPA,
 990                                   LLP_ECODE_RECEIVED_CRC, 0);
 991                return -EINVAL;
 992        }
 993        return 0;
 994}
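
/*
 * Trailer layout note (descriptive): tbuf points srx->pad bytes before
 * trailer.crc, so the single skb_copy_bits() above fetches the pad
 * bytes and the 4 CRC bytes in one go. The pad bytes still belong to
 * the FPDU and are therefore folded into the local CRC, while the
 * received CRC is compared as-is in network byte order.
 */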
 995
 996#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
 997
 998static int siw_get_hdr(struct siw_rx_stream *srx)
 999{
1000        struct sk_buff *skb = srx->skb;
1001        struct siw_qp *qp = rx_qp(srx);
1002        struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
1003        struct siw_rx_fpdu *frx;
1004        u8 opcode;
1005        int bytes;
1006
1007        if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
1008                /*
1009                 * copy a minimum sized (tagged) DDP frame control part
1010                 */
1011                bytes = min_t(int, srx->skb_new,
1012                              MIN_DDP_HDR - srx->fpdu_part_rcvd);
1013
1014                skb_copy_bits(skb, srx->skb_offset,
1015                              (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1016
1017                srx->fpdu_part_rcvd += bytes;
1018
1019                srx->skb_new -= bytes;
1020                srx->skb_offset += bytes;
1021                srx->skb_copied += bytes;
1022
1023                if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
1024                        return -EAGAIN;
1025
1026                if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
1027                        enum ddp_etype etype;
1028                        enum ddp_ecode ecode;
1029
1030                        pr_warn("siw: received ddp version unsupported %d\n",
1031                                __ddp_get_version(c_hdr));
1032
1033                        if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
1034                                etype = DDP_ETYPE_TAGGED_BUF;
1035                                ecode = DDP_ECODE_T_VERSION;
1036                        } else {
1037                                etype = DDP_ETYPE_UNTAGGED_BUF;
1038                                ecode = DDP_ECODE_UT_VERSION;
1039                        }
1040                        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
1041                                           etype, ecode, 0);
1042                        return -EINVAL;
1043                }
1044                if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
1045                        pr_warn("siw: received rdmap version unsupported %d\n",
1046                                __rdmap_get_version(c_hdr));
1047
1048                        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1049                                           RDMAP_ETYPE_REMOTE_OPERATION,
1050                                           RDMAP_ECODE_VERSION, 0);
1051                        return -EINVAL;
1052                }
1053                opcode = __rdmap_get_opcode(c_hdr);
1054
1055                if (opcode > RDMAP_TERMINATE) {
1056                        pr_warn("siw: received unknown packet type %u\n",
1057                                opcode);
1058
1059                        siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
1060                                           RDMAP_ETYPE_REMOTE_OPERATION,
1061                                           RDMAP_ECODE_OPCODE, 0);
1062                        return -EINVAL;
1063                }
1064                siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
1065        } else {
1066                opcode = __rdmap_get_opcode(c_hdr);
1067        }
1068        set_rx_fpdu_context(qp, opcode);
1069        frx = qp->rx_fpdu;
1070
1071        /*
1072         * Figure out the length of the current header: the variable
1073         * length of the iWARP header may force us to copy the header
1074         * in two steps. Only tagged DDP message headers are already
1075         * completely received at this point.
1076         */
1077        if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
1078                bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
1079
1080                if (srx->skb_new < bytes)
1081                        return -EAGAIN;
1082
1083                skb_copy_bits(skb, srx->skb_offset,
1084                              (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
1085
1086                srx->fpdu_part_rcvd += bytes;
1087
1088                srx->skb_new -= bytes;
1089                srx->skb_offset += bytes;
1090                srx->skb_copied += bytes;
1091        }
1092
1093        /*
1094         * DDP/RDMAP header receive completed. Check if the current
1095         * DDP segment starts a new RDMAP message or continues a previously
1096         * started RDMAP message.
1097         *
1098         * Alternating reception of DDP segments (or FPDUs) from incomplete
1099         * tagged and untagged RDMAP messages is supported, as long as
1100         * the current tagged or untagged message gets eventually completed
1101         * w/o intersection from another message of the same type
1102         * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1103         * but not by a READ RESPONSE etc.
1104         */
1105        if (srx->mpa_crc_hd) {
1106                /*
1107                 * Restart CRC computation
1108                 */
1109                crypto_shash_init(srx->mpa_crc_hd);
1110                crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
1111                                    srx->fpdu_part_rcvd);
1112        }
1113        if (frx->more_ddp_segs) {
1114                frx->first_ddp_seg = 0;
1115                if (frx->prev_rdmap_op != opcode) {
1116                        pr_warn("siw: packet intersection: %u : %u\n",
1117                                frx->prev_rdmap_op, opcode);
1118                        /*
1119                         * The last inbound RDMA operation of same type
1120                         * (tagged or untagged) is left unfinished.
1121                         * To complete it in error, make it the current
1122                         * operation again, even with the header already
1123                         * overwritten. For error handling, only the opcode
1124                         * and current rx context are relevant.
1125                         */
1126                        set_rx_fpdu_context(qp, frx->prev_rdmap_op);
1127                        __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
1128                        return -EPROTO;
1129                }
1130        } else {
1131                frx->prev_rdmap_op = opcode;
1132                frx->first_ddp_seg = 1;
1133        }
1134        frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
1135
1136        return 0;
1137}
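
/*
 * Interleaving example (illustrative): a WRITE whose DDP_FLAG_LAST
 * segment has not yet arrived (tagged, tracked in qp->rx_tagged) may be
 * interleaved with SEND segments (untagged, tracked in qp->rx_untagged),
 * since each type owns its own rx_fpdu context. If instead a READ
 * RESPONSE header shows up before that WRITE finished, prev_rdmap_op
 * (RDMAP_RDMA_WRITE) differs from the new opcode and the unfinished
 * WRITE is completed in error via the -EPROTO path above.
 */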
1138
1139static int siw_check_tx_fence(struct siw_qp *qp)
1140{
1141        struct siw_wqe *tx_waiting = tx_wqe(qp);
1142        struct siw_sqe *rreq;
1143        int resume_tx = 0, rv = 0;
1144        unsigned long flags;
1145
1146        spin_lock_irqsave(&qp->orq_lock, flags);
1147
1148        rreq = orq_get_current(qp);
1149
1150        /* free current orq entry */
1151        WRITE_ONCE(rreq->flags, 0);
1152
1153        if (qp->tx_ctx.orq_fence) {
1154                if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
1155                        pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1156                                qp_id(qp), tx_waiting->wr_status);
1157                        rv = -EPROTO;
1158                        goto out;
1159                }
1160                /* resume SQ processing */
1161                if (tx_waiting->sqe.opcode == SIW_OP_READ ||
1162                    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
1163                        rreq = orq_get_tail(qp);
1164                        if (unlikely(!rreq)) {
1165                                pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
1166                                rv = -EPROTO;
1167                                goto out;
1168                        }
1169                        siw_read_to_orq(rreq, &tx_waiting->sqe);
1170
1171                        qp->orq_put++;
1172                        qp->tx_ctx.orq_fence = 0;
1173                        resume_tx = 1;
1174
1175                } else if (siw_orq_empty(qp)) {
1176                        qp->tx_ctx.orq_fence = 0;
1177                        resume_tx = 1;
1178                } else {
1179                        pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
1180                                qp_id(qp), qp->orq_get, qp->orq_put);
1181                        rv = -EPROTO;
1182                }
1183        }
1184        qp->orq_get++;
1185out:
1186        spin_unlock_irqrestore(&qp->orq_lock, flags);
1187
1188        if (resume_tx)
1189                rv = siw_sq_start(qp);
1190
1191        return rv;
1192}
1193
1194/*
1195 * siw_rdmap_complete()
1196 *
1197 * Complete processing of an RDMA message after receiving all of its
1198 * DDP segments, or abort processing after encountering an error case.
1199 *
1200 *   o SENDs and RRESPs need work completion,
1201 *   o RREQs need READ RESPONSE initialization,
1202 *   o WRITEs need memory dereferencing
1203 *
1204 * TODO: Failed WRITEs need local error to be surfaced.
1205 */
1206static int siw_rdmap_complete(struct siw_qp *qp, int error)
1207{
1208        struct siw_rx_stream *srx = &qp->rx_stream;
1209        struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
1210        enum siw_wc_status wc_status = wqe->wc_status;
1211        u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
1212        int rv = 0;
1213
1214        switch (opcode) {
1215        case RDMAP_SEND_SE:
1216        case RDMAP_SEND_SE_INVAL:
1217                wqe->rqe.flags |= SIW_WQE_SOLICITED;
1218                fallthrough;
1219
1220        case RDMAP_SEND:
1221        case RDMAP_SEND_INVAL:
1222                if (wqe->wr_status == SIW_WR_IDLE)
1223                        break;
1224
1225                srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
1226
1227                if (error != 0 && wc_status == SIW_WC_SUCCESS)
1228                        wc_status = SIW_WC_GENERAL_ERR;
1229                /*
1230                 * Handle STag invalidation request
1231                 */
1232                if (wc_status == SIW_WC_SUCCESS &&
1233                    (opcode == RDMAP_SEND_INVAL ||
1234                     opcode == RDMAP_SEND_SE_INVAL)) {
1235                        rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
1236                        if (rv) {
1237                                siw_init_terminate(
1238                                        qp, TERM_ERROR_LAYER_RDMAP,
1239                                        rv == -EACCES ?
1240                                                RDMAP_ETYPE_REMOTE_PROTECTION :
1241                                                RDMAP_ETYPE_REMOTE_OPERATION,
1242                                        RDMAP_ECODE_CANNOT_INVALIDATE, 0);
1243
1244                                wc_status = SIW_WC_REM_INV_REQ_ERR;
1245                        }
1246                        rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1247                                              rv ? 0 : srx->inval_stag,
1248                                              wc_status);
1249                } else {
1250                        rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
1251                                              0, wc_status);
1252                }
1253                siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
1254                break;
1255
1256        case RDMAP_RDMA_READ_RESP:
1257                if (wqe->wr_status == SIW_WR_IDLE)
1258                        break;
1259
1260                if (error != 0) {
1261                        if ((srx->state == SIW_GET_HDR &&
1262                             qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
1263                                /* possible RREQ in ORQ left untouched */
1264                                break;
1265
1266                        if (wc_status == SIW_WC_SUCCESS)
1267                                wc_status = SIW_WC_GENERAL_ERR;
1268                } else if (rdma_is_kernel_res(&qp->base_qp.res) &&
1269                           rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
1270                        /*
1271                         * Handle any STag invalidation request
1272                         */
1273                        rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
1274                        if (rv) {
1275                                siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
1276                                                   RDMAP_ETYPE_CATASTROPHIC,
1277                                                   RDMAP_ECODE_UNSPECIFIED, 0);
1278
1279                                if (wc_status == SIW_WC_SUCCESS) {
1280                                        wc_status = SIW_WC_GENERAL_ERR;
1281                                        error = rv;
1282                                }
1283                        }
1284                }
1285                /*
1286                 * All errors make the wqe signalled.
1287                 */
1288                if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
1289                        rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
1290                                              wc_status);
1291                siw_wqe_put_mem(wqe, SIW_OP_READ);
1292
1293                if (!error)
1294                        rv = siw_check_tx_fence(qp);
1295                else
1296                        /* Disable current ORQ element */
1297                        WRITE_ONCE(orq_get_current(qp)->flags, 0);
1298                break;
1299
1300        case RDMAP_RDMA_READ_REQ:
1301                if (!error) {
1302                        rv = siw_init_rresp(qp, srx);
1303                        srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
1304                }
1305                break;
1306
1307        case RDMAP_RDMA_WRITE:
1308                if (wqe->wr_status == SIW_WR_IDLE)
1309                        break;
1310
1311                /*
1312                 * Free the reference on the memory object if one is
1313                 * attached to the receive context (inbound WRITE).
1314                 * While a zero-length WRITE is allowed,
1315                 * no memory reference gets created for it.
1316                 */
1317                if (rx_mem(&qp->rx_tagged)) {
1318                        siw_mem_put(rx_mem(&qp->rx_tagged));
1319                        rx_mem(&qp->rx_tagged) = NULL;
1320                }
1321                break;
1322
1323        default:
1324                break;
1325        }
1326        wqe->wr_status = SIW_WR_IDLE;
1327
1328        return rv;
1329}
1330
1331/*
1332 * siw_tcp_rx_data()
1333 *
1334 * Main routine to consume inbound TCP payload
1335 *
1336 * @rd_desc:    read descriptor
1337 * @skb:        socket buffer
1338 * @off:        offset in skb
1339 * @len:        skb->len - offset : payload in skb
1340 */
1341int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
1342                    unsigned int off, size_t len)
1343{
1344        struct siw_qp *qp = rd_desc->arg.data;
1345        struct siw_rx_stream *srx = &qp->rx_stream;
1346        int rv;
1347
1348        srx->skb = skb;
1349        srx->skb_new = skb->len - off;
1350        srx->skb_offset = off;
1351        srx->skb_copied = 0;
1352
1353        siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
1354
1355        while (srx->skb_new) {
1356                int run_completion = 1;
1357
1358                if (unlikely(srx->rx_suspend)) {
1359                        /* Do not process any more data */
1360                        srx->skb_copied += srx->skb_new;
1361                        break;
1362                }
1363                switch (srx->state) {
1364                case SIW_GET_HDR:
1365                        rv = siw_get_hdr(srx);
1366                        if (!rv) {
1367                                srx->fpdu_part_rem =
1368                                        be16_to_cpu(srx->hdr.ctrl.mpa_len) -
1369                                        srx->fpdu_part_rcvd + MPA_HDR_SIZE;
1370
1371                                if (srx->fpdu_part_rem)
1372                                        srx->pad = -srx->fpdu_part_rem & 0x3;
1373                                else
1374                                        srx->pad = 0;
1375
1376                                srx->state = SIW_GET_DATA_START;
1377                                srx->fpdu_part_rcvd = 0;
1378                        }
1379                        break;
1380
1381                case SIW_GET_DATA_MORE:
1382                        /*
1383                         * Another data fragment of the same DDP segment.
1384                         * Setting first_ddp_seg = 0 avoids repeating
1385                         * initializations that shall occur only once per
1386                         * DDP segment.
1387                         */
1388                        qp->rx_fpdu->first_ddp_seg = 0;
1389                        fallthrough;
1390
1391                case SIW_GET_DATA_START:
1392                        /*
1393                         * Headers will be checked by the opcode-specific
1394                         * data receive function below.
1395                         */
1396                        rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
1397                        if (!rv) {
1398                                int mpa_len =
1399                                        be16_to_cpu(srx->hdr.ctrl.mpa_len)
1400                                        + MPA_HDR_SIZE;
1401
1402                                srx->fpdu_part_rem = (-mpa_len & 0x3)
1403                                                      + MPA_CRC_SIZE;
1404                                srx->fpdu_part_rcvd = 0;
1405                                srx->state = SIW_GET_TRAILER;
1406                        } else {
1407                                if (unlikely(rv == -ECONNRESET))
1408                                        run_completion = 0;
1409                                else
1410                                        srx->state = SIW_GET_DATA_MORE;
1411                        }
1412                        break;
1413
1414                case SIW_GET_TRAILER:
1415                        /*
1416                         * read CRC + any padding
1417                         */
1418                        rv = siw_get_trailer(qp, srx);
1419                        if (likely(!rv)) {
1420                                /*
1421                                 * FPDU completed.
1422                                 * complete RDMAP message if last fragment
1423                                 */
1424                                srx->state = SIW_GET_HDR;
1425                                srx->fpdu_part_rcvd = 0;
1426
1427                                if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
1428                                      DDP_FLAG_LAST))
1429                                        /* more frags */
1430                                        break;
1431
1432                                rv = siw_rdmap_complete(qp, 0);
1433                                run_completion = 0;
1434                        }
1435                        break;
1436
1437                default:
1438                        pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
1439                        rv = -EPROTO;
1440                        run_completion = 0;
1441                }
1442                if (unlikely(rv != 0 && rv != -EAGAIN)) {
1443                        if ((srx->state > SIW_GET_HDR ||
1444                             qp->rx_fpdu->more_ddp_segs) && run_completion)
1445                                siw_rdmap_complete(qp, rv);
1446
1447                        siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
1448                                   srx->state);
1449
1450                        siw_qp_cm_drop(qp, 1);
1451
1452                        break;
1453                }
1454                if (rv) {
1455                        siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
1456                                   srx->state, srx->fpdu_part_rem);
1457                        break;
1458                }
1459        }
1460        return srx->skb_copied;
1461}
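
/*
 * FPDU framing arithmetic used above (a sketch, assuming the usual
 * MPA_HDR_SIZE == 2 and MPA_CRC_SIZE == 4): for an MPA length field of
 * 27, the local mpa_len in the SIW_GET_DATA_* arm becomes 27 + 2 = 29,
 * so pad = -29 & 0x3 = 3 and the trailer expected by SIW_GET_TRAILER is
 * 3 + 4 = 7 bytes. The resulting FPDU (2 + 27 + 3 + 4 = 36 bytes) is a
 * multiple of 4, as MPA padding requires.
 */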
1462