dpdk/drivers/net/mlx4/mlx4_rxtx.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright 2017 6WIND S.A.
   3 * Copyright 2017 Mellanox Technologies, Ltd
   4 */
   5
   6/**
   7 * @file
   8 * Data plane functions for mlx4 driver.
   9 */
  10
  11#include <stdbool.h>
  12#include <stdint.h>
  13#include <string.h>
  14
  15/* Verbs headers do not support -pedantic. */
  16#ifdef PEDANTIC
  17#pragma GCC diagnostic ignored "-Wpedantic"
  18#endif
  19#include <infiniband/verbs.h>
  20#ifdef PEDANTIC
  21#pragma GCC diagnostic error "-Wpedantic"
  22#endif
  23
  24#include <rte_branch_prediction.h>
  25#include <rte_common.h>
  26#include <rte_io.h>
  27#include <rte_mbuf.h>
  28#include <rte_mempool.h>
  29#include <rte_prefetch.h>
  30
  31#include "mlx4.h"
  32#include "mlx4_prm.h"
  33#include "mlx4_rxtx.h"
  34#include "mlx4_utils.h"
  35
  36/**
  37 * Pointer-value pair structure used in the Tx burst path for saving the
  38 * first DWORD (32-bit) of a TXBB.
  39 */
  40struct pv {
  41        union {
  42                volatile struct mlx4_wqe_data_seg *dseg;
  43                volatile uint32_t *dst;
  44        };
  45        uint32_t val;
  46};
  47
  48/** A helper structure for TSO packet handling. */
  49struct tso_info {
  51        /** Pointer to the array of saved TXBB first DWORDs (32-bit). */
  51        struct pv *pv;
  52        /** Current entry in the pv array. */
  53        int pv_counter;
  54        /** Total size of the WQE including padding. */
  55        uint32_t wqe_size;
  56        /** Size of TSO header to prepend to each packet to send. */
  57        uint16_t tso_header_size;
  58        /** Total size of the TSO segment in the WQE. */
  59        uint16_t wqe_tso_seg_size;
  60        /** Raw WQE size in units of 16 Bytes and without padding. */
  61        uint8_t fence_size;
  62};
  63
  64/** A table to translate Rx completion flags to packet type. */
  65uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
  66        /*
  67         * The index to the array should have:
  68         *  bit[7] - MLX4_CQE_L2_TUNNEL
  69         *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
  70         *  bit[5] - MLX4_CQE_STATUS_UDP
  71         *  bit[4] - MLX4_CQE_STATUS_TCP
  72         *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
  73         *  bit[2] - MLX4_CQE_STATUS_IPV6
  74         *  bit[1] - MLX4_CQE_STATUS_IPF
  75         *  bit[0] - MLX4_CQE_STATUS_IPV4
  76         * giving a total of up to 256 entries.
  77         */
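            /* Entries not listed below stay zero-initialized, i.e. RTE_PTYPE_UNKNOWN. */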
  78        /* L2 */
  79        [0x00] = RTE_PTYPE_L2_ETHER,
  80        /* L3 */
  81        [0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
  82                     RTE_PTYPE_L4_NONFRAG,
  83        [0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
  84                     RTE_PTYPE_L4_FRAG,
  85        [0x03] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
  86                     RTE_PTYPE_L4_FRAG,
  87        [0x04] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
  88                     RTE_PTYPE_L4_NONFRAG,
  89        [0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
  90                     RTE_PTYPE_L4_FRAG,
  91        [0x08] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
  92                     RTE_PTYPE_L4_NONFRAG,
  93        [0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
  94                     RTE_PTYPE_L4_NONFRAG,
  95        [0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
  96                     RTE_PTYPE_L4_FRAG,
  97        [0x0b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
  98                     RTE_PTYPE_L4_FRAG,
  99        /* TCP */
 100        [0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 101                     RTE_PTYPE_L4_TCP,
 102        [0x14] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 103                     RTE_PTYPE_L4_TCP,
 104        [0x16] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 105                     RTE_PTYPE_L4_FRAG,
 106        [0x18] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
 107                     RTE_PTYPE_L4_TCP,
 108        [0x19] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
 109                     RTE_PTYPE_L4_TCP,
 110        /* UDP */
 111        [0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 112                     RTE_PTYPE_L4_UDP,
 113        [0x24] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 114                     RTE_PTYPE_L4_UDP,
 115        [0x26] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 116                     RTE_PTYPE_L4_FRAG,
 117        [0x28] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
 118                     RTE_PTYPE_L4_UDP,
 119        [0x29] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT |
 120                     RTE_PTYPE_L4_UDP,
 121        /* Tunneled - L3 IPV6 */
 122        [0x80] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
 123        [0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 124                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 125                     RTE_PTYPE_INNER_L4_NONFRAG,
 126        [0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 127                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 128                     RTE_PTYPE_INNER_L4_FRAG,
 129        [0x83] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 130                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 131                     RTE_PTYPE_INNER_L4_FRAG,
 132        [0x84] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 133                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 134                     RTE_PTYPE_INNER_L4_NONFRAG,
 135        [0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 136                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 137                     RTE_PTYPE_INNER_L4_FRAG,
 138        [0x88] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 139                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 140                     RTE_PTYPE_INNER_L4_NONFRAG,
 141        [0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 142                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 143                     RTE_PTYPE_INNER_L4_NONFRAG,
 144        [0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 145                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 146                     RTE_PTYPE_INNER_L4_FRAG,
 147        [0x8b] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 148                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 149                     RTE_PTYPE_INNER_L4_FRAG,
 150        /* Tunneled - L3 IPV6, TCP */
 151        [0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 152                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 153                     RTE_PTYPE_INNER_L4_TCP,
 154        [0x94] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 155                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 156                     RTE_PTYPE_INNER_L4_TCP,
 157        [0x96] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 158                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 159                     RTE_PTYPE_INNER_L4_FRAG,
 160        [0x98] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 161                     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
 162        [0x99] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 163                     RTE_PTYPE_INNER_L3_IPV4_EXT | RTE_PTYPE_INNER_L4_TCP,
 164        /* Tunneled - L3 IPV6, UDP */
 165        [0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 166                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 167                     RTE_PTYPE_INNER_L4_UDP,
 168        [0xa4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 169                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 170                     RTE_PTYPE_INNER_L4_UDP,
 171        [0xa6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 172                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 173                     RTE_PTYPE_INNER_L4_FRAG,
 174        [0xa8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 175                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 176                     RTE_PTYPE_INNER_L4_UDP,
 177        [0xa9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
 178                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 179                     RTE_PTYPE_INNER_L4_UDP,
 180        /* Tunneled - L3 IPV4 */
 181        [0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
 182        [0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 183                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 184                     RTE_PTYPE_INNER_L4_NONFRAG,
 185        [0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 186                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 187                     RTE_PTYPE_INNER_L4_FRAG,
 188        [0xc3] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 189                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 190                     RTE_PTYPE_INNER_L4_FRAG,
 191        [0xc4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 192                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 193                     RTE_PTYPE_INNER_L4_NONFRAG,
 194        [0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 195                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 196                     RTE_PTYPE_INNER_L4_FRAG,
 197        [0xc8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 198                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 199                     RTE_PTYPE_INNER_L4_NONFRAG,
 200        [0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 201                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 202                     RTE_PTYPE_INNER_L4_NONFRAG,
 203        [0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 204                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 205                     RTE_PTYPE_INNER_L4_FRAG,
 206        [0xcb] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 207                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 208                     RTE_PTYPE_INNER_L4_FRAG,
 209        /* Tunneled - L3 IPV4, TCP */
 210        [0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 211                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 212                     RTE_PTYPE_INNER_L4_TCP,
 213        [0xd4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 214                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 215                     RTE_PTYPE_INNER_L4_TCP,
 216        [0xd6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 217                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 218                     RTE_PTYPE_INNER_L4_FRAG,
 219        [0xd8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 220                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 221                     RTE_PTYPE_INNER_L4_TCP,
 222        [0xd9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 223                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 224                     RTE_PTYPE_INNER_L4_TCP,
 225        /* Tunneled - L3 IPV4, UDP */
 226        [0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 227                     RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
 228                     RTE_PTYPE_INNER_L4_UDP,
 229        [0xe4] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 230                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 231                     RTE_PTYPE_INNER_L4_UDP,
 232        [0xe6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 233                     RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
 234                     RTE_PTYPE_INNER_L4_FRAG,
 235        [0xe8] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 236                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 237                     RTE_PTYPE_INNER_L4_UDP,
 238        [0xe9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
 239                     RTE_PTYPE_INNER_L3_IPV4_EXT |
 240                     RTE_PTYPE_INNER_L4_UDP,
 241};
 242
 243/**
 244 * Stamp TXBB burst so it won't be reused by the HW.
 245 *
 246 * This routine is used when freeing a WQE used by the chip or when
 247 * building a WQ entry has failed, leaving partial information on the queue.
 248 *
 249 * @param sq
 250 *   Pointer to the SQ structure.
 251 * @param start
 252 *   Pointer to the first TXBB to stamp.
 253 * @param end
 254 *   Pointer to the TXBB just past the last one to stamp (exclusive).
 255 *
 256 * @return
 257 *   Stamping burst size in byte units.
 258 */
 259static uint32_t
 260mlx4_txq_stamp_freed_wqe(struct mlx4_sq *sq, volatile uint32_t *start,
 261                         volatile uint32_t *end)
 262{
 263        uint32_t stamp = sq->stamp;
 264        int32_t size = (intptr_t)end - (intptr_t)start;
 265
 266        MLX4_ASSERT(start != end);
 267        /* Hold SQ ring wrap around. */
 268        if (size < 0) {
 269                size = (int32_t)sq->size + size;
 270                do {
 271                        *start = stamp;
 272                        start += MLX4_SQ_STAMP_DWORDS;
 273                } while (start != (volatile uint32_t *)sq->eob);
 274                start = (volatile uint32_t *)sq->buf;
 275                /* Flip invalid stamping ownership. */
 276                stamp ^= RTE_BE32(1u << MLX4_SQ_OWNER_BIT);
 277                sq->stamp = stamp;
 278                if (start == end)
 279                        return size;
 280        }
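            /* Stamp the first DWORD of each TXBB from start up to (but not including) end. */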
 281        do {
 282                *start = stamp;
 283                start += MLX4_SQ_STAMP_DWORDS;
 284        } while (start != end);
 285        return (uint32_t)size;
 286}
 287
 288/**
 289 * Manage Tx completions.
 290 *
 291 * When sending a burst, mlx4_tx_burst() posts several WRs.
 292 * To improve performance, a completion event is only required once every
 293 * MLX4_PMD_TX_PER_COMP_REQ sends. Doing so discards completion information
 294 * for other WRs, but this information would not be used anyway.
 295 *
 296 * @param txq
 297 *   Pointer to Tx queue structure.
 298 * @param elts_m
 299 *   Tx elements number mask.
 300 * @param sq
 301 *   Pointer to the SQ structure.
 302 */
 303static void
 304mlx4_txq_complete(struct txq *txq, const unsigned int elts_m,
 305                  struct mlx4_sq *sq)
 306{
 307        unsigned int elts_tail = txq->elts_tail;
 308        struct mlx4_cq *cq = &txq->mcq;
 309        volatile struct mlx4_cqe *cqe;
 310        uint32_t completed;
 311        uint32_t cons_index = cq->cons_index;
 312        volatile uint32_t *first_txbb;
 313
 314        /*
 315         * Traverse all the CQ entries reported and handle each WQ entry
 316         * they report.
 317         */
 318        do {
 319                cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cons_index);
 320                if (unlikely(!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 321                    !!(cons_index & cq->cqe_cnt)))
 322                        break;
 323#ifdef RTE_LIBRTE_MLX4_DEBUG
 324                /*
 325                 * Make sure we read the CQE after we read the ownership bit.
 326                 */
 327                rte_io_rmb();
 328                if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 329                             MLX4_CQE_OPCODE_ERROR)) {
 330                        volatile struct mlx4_err_cqe *cqe_err =
 331                                (volatile struct mlx4_err_cqe *)cqe;
 332                        ERROR("%p CQE error - vendor syndrome: 0x%x"
 333                              " syndrome: 0x%x\n",
 334                              (void *)txq, cqe_err->vendor_err,
 335                              cqe_err->syndrome);
 336                        break;
 337                }
 338#endif /* RTE_LIBRTE_MLX4_DEBUG */
 339                cons_index++;
 340        } while (1);
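            /*
             * Each CQE accounts for elts_comp_cd_init Tx elements, as a
             * completion is requested only once per that many sends.
             */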
 341        completed = (cons_index - cq->cons_index) * txq->elts_comp_cd_init;
 342        if (unlikely(!completed))
 343                return;
 344        /* The first stamping address is the end of the previous completion burst. */
 345        first_txbb = (&(*txq->elts)[elts_tail & elts_m])->eocb;
 346        elts_tail += completed;
 347        /* The new tail element holds the end address. */
 348        sq->remain_size += mlx4_txq_stamp_freed_wqe(sq, first_txbb,
 349                (&(*txq->elts)[elts_tail & elts_m])->eocb);
 350        /* Update CQ consumer index. */
 351        cq->cons_index = cons_index;
 352        *cq->set_ci_db = rte_cpu_to_be_32(cons_index & MLX4_CQ_DB_CI_MASK);
 353        txq->elts_tail = elts_tail;
 354}
 355
 356/**
 357 * Write Tx data segment to the SQ.
 358 *
 359 * @param dseg
 360 *   Pointer to data segment in SQ.
 361 * @param lkey
 362 *   Memory region lkey.
 363 * @param addr
 364 *   Data address.
 365 * @param byte_count
 366 *   Big endian bytes count of the data to send.
 367 */
 368static inline void
 369mlx4_fill_tx_data_seg(volatile struct mlx4_wqe_data_seg *dseg,
 370                       uint32_t lkey, uintptr_t addr, rte_be32_t  byte_count)
 371{
 372        dseg->addr = rte_cpu_to_be_64(addr);
 373        dseg->lkey = lkey;
 374#if RTE_CACHE_LINE_SIZE < 64
 375        /*
 376         * Need a barrier here before writing the byte_count
 377         * fields to make sure that all the data is visible
 378         * before the byte_count field is set.
 379         * Otherwise, if the segment begins a new cacheline,
 380         * the HCA prefetcher could grab the 64-byte chunk and
 381         * get a valid (!= 0xffffffff) byte count but stale
 382         * data, and end up sending the wrong data.
 383         */
 384        rte_io_wmb();
 385#endif /* RTE_CACHE_LINE_SIZE */
 386        dseg->byte_count = byte_count;
 387}
 388
 389/**
 390 * Obtain and calculate TSO information needed for assembling a TSO WQE.
 391 *
 392 * @param buf
 393 *   Pointer to the first packet mbuf.
 394 * @param txq
 395 *   Pointer to Tx queue structure.
 396 * @param tinfo
 397 *   Pointer to a structure to fill the info with.
 398 *
 399 * @return
 400 *   0 on success, negative value upon error.
 401 */
 402static inline int
 403mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
 404                             struct txq *txq,
 405                             struct tso_info *tinfo)
 406{
 407        struct mlx4_sq *sq = &txq->msq;
 408        const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
 409                                 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
 410
 411        tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
 412        if (tunneled)
 413                tinfo->tso_header_size +=
 414                                buf->outer_l2_len + buf->outer_l3_len;
 415        if (unlikely(buf->tso_segsz == 0 ||
 416                     tinfo->tso_header_size == 0 ||
 417                     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
 418                     tinfo->tso_header_size > buf->data_len))
 419                return -EINVAL;
 420        /*
 421         * Calculate the WQE TSO segment size
 422         * Note:
 423         * 1. An LSO segment must be padded such that the subsequent data
 424         *    segment is 16-byte aligned.
 425         * 2. The start address of the TSO segment is always 16 Bytes aligned.
 426         */
 427        tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
 428                                            tinfo->tso_header_size,
 429                                            sizeof(struct mlx4_wqe_data_seg));
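            /*
             * fence_size is the raw WQE size in 16-byte units: control segment
             * plus TSO segment (including the inlined header), plus one data
             * segment per mbuf segment.
             */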
 430        tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
 431                             tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
 432                             buf->nb_segs;
 433        tinfo->wqe_size =
 434                RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
 435                          MLX4_TXBB_SIZE);
 436        /* Validate WQE size and WQE space in the send queue. */
 437        if (sq->remain_size < tinfo->wqe_size ||
 438            tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
 439                return -ENOMEM;
 440        /* Init pv. */
 441        tinfo->pv = (struct pv *)txq->bounce_buf;
 442        tinfo->pv_counter = 0;
 443        return 0;
 444}
 445
 446/**
 447 * Fill the TSO WQE data segments with info on the buffers to transmit.
 448 *
 449 * @param buf
 450 *   Pointer to the first packet mbuf.
 451 * @param txq
 452 *   Pointer to Tx queue structure.
 453 * @param tinfo
 454 *   Pointer to TSO info to use.
 455 * @param dseg
 456 *   Pointer to the first data segment in the TSO WQE.
 457 * @param ctrl
 458 *   Pointer to the control segment in the TSO WQE.
 459 *
 460 * @return
 461 *   Pointer to the next WQE control segment on success, NULL otherwise.
 462 */
 463static inline volatile struct mlx4_wqe_ctrl_seg *
 464mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
 465                             struct txq *txq,
 466                             struct tso_info *tinfo,
 467                             volatile struct mlx4_wqe_data_seg *dseg,
 468                             volatile struct mlx4_wqe_ctrl_seg *ctrl)
 469{
 470        uint32_t lkey;
 471        int nb_segs = buf->nb_segs;
 472        int nb_segs_txbb;
 473        struct mlx4_sq *sq = &txq->msq;
 474        struct rte_mbuf *sbuf = buf;
 475        struct pv *pv = tinfo->pv;
 476        int *pv_counter = &tinfo->pv_counter;
 477        volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
 478                        (volatile struct mlx4_wqe_ctrl_seg *)
 479                                ((volatile uint8_t *)ctrl + tinfo->wqe_size);
 480        uint16_t data_len = sbuf->data_len - tinfo->tso_header_size;
 481        uintptr_t data_addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
 482                                                      tinfo->tso_header_size);
 483
 484        do {
 485                /* How many dseg entries are left in the current TXBB? */
 486                nb_segs_txbb = (MLX4_TXBB_SIZE -
 487                                ((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
 488                               MLX4_SEG_SHIFT;
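                    /*
                     * Jump to the case matching the number of free dseg slots
                     * left in this TXBB and fall through until it is full.
                     * A dseg that opens a new TXBB (case 4) has its byte_count
                     * write deferred through pv[] so the HW never sees a valid
                     * count before the rest of the WQE is written.
                     */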
 489                switch (nb_segs_txbb) {
 490#ifdef RTE_LIBRTE_MLX4_DEBUG
 491                default:
 492                        /* Should never happen. */
 493                        rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
 494                        (void *)txq, nb_segs_txbb);
 495                        /* rte_panic never returns. */
 496                        break;
 497#endif /* RTE_LIBRTE_MLX4_DEBUG */
 498                case 4:
 499                        /* Memory region key for this memory pool. */
 500                        lkey = mlx4_tx_mb2mr(txq, sbuf);
 501                        if (unlikely(lkey == (uint32_t)-1))
 502                                goto err;
 503                        dseg->addr = rte_cpu_to_be_64(data_addr);
 504                        dseg->lkey = lkey;
 505                        /*
 506                         * This data segment starts at the beginning of a new
 507                         * TXBB, so we need to postpone its byte_count writing
 508                         * for later.
 509                         */
 510                        pv[*pv_counter].dseg = dseg;
 511                        /*
 512                         * Zero length segment is treated as inline segment
 513                         * with zero data.
 514                         */
 515                        pv[(*pv_counter)++].val =
 516                                rte_cpu_to_be_32(data_len ?
 517                                                 data_len :
 518                                                 0x80000000);
 519                        if (--nb_segs == 0)
 520                                return ctrl_next;
 521                        /* Prepare next buf info */
 522                        sbuf = sbuf->next;
 523                        dseg++;
 524                        data_len = sbuf->data_len;
 525                        data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 526                        /* fallthrough */
 527                case 3:
 528                        lkey = mlx4_tx_mb2mr(txq, sbuf);
 529                        if (unlikely(lkey == (uint32_t)-1))
 530                                goto err;
 531                        mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
 532                                        rte_cpu_to_be_32(data_len ?
 533                                                         data_len :
 534                                                         0x80000000));
 535                        if (--nb_segs == 0)
 536                                return ctrl_next;
 537                        /* Prepare next buf info */
 538                        sbuf = sbuf->next;
 539                        dseg++;
 540                        data_len = sbuf->data_len;
 541                        data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 542                        /* fallthrough */
 543                case 2:
 544                        lkey = mlx4_tx_mb2mr(txq, sbuf);
 545                        if (unlikely(lkey == (uint32_t)-1))
 546                                goto err;
 547                        mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
 548                                        rte_cpu_to_be_32(data_len ?
 549                                                         data_len :
 550                                                         0x80000000));
 551                        if (--nb_segs == 0)
 552                                return ctrl_next;
 553                        /* Prepare next buf info */
 554                        sbuf = sbuf->next;
 555                        dseg++;
 556                        data_len = sbuf->data_len;
 557                        data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 558                        /* fallthrough */
 559                case 1:
 560                        lkey = mlx4_tx_mb2mr(txq, sbuf);
 561                        if (unlikely(lkey == (uint32_t)-1))
 562                                goto err;
 563                        mlx4_fill_tx_data_seg(dseg, lkey, data_addr,
 564                                        rte_cpu_to_be_32(data_len ?
 565                                                         data_len :
 566                                                         0x80000000));
 567                        if (--nb_segs == 0)
 568                                return ctrl_next;
 569                        /* Prepare next buf info */
 570                        sbuf = sbuf->next;
 571                        dseg++;
 572                        data_len = sbuf->data_len;
 573                        data_addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
 574                        /* fallthrough */
 575                }
 576                /* Wrap dseg if it points at the end of the queue. */
 577                if ((volatile uint8_t *)dseg >= sq->eob)
 578                        dseg = (volatile struct mlx4_wqe_data_seg *)
 579                                        ((volatile uint8_t *)dseg - sq->size);
 580        } while (true);
 581err:
 582        return NULL;
 583}
 584
 585/**
 586 * Copy the packet's L2, L3 and L4 headers into the WQE.
 587 *
 588 * This will be used as the header for each TSO segment that is transmitted.
 589 *
 590 * @param buf
 591 *   Pointer to the first packet mbuf.
 592 * @param txq
 593 *   Pointer to Tx queue structure.
 594 * @param tinfo
 595 *   Pointer to TSO info to use.
 596 * @param ctrl
 597 *   Pointer to the control segment in the TSO WQE.
 598 *
 599 * @return
 600 *   Pointer to the location of the first data segment in the WQE.
 601 */
 602static inline volatile struct mlx4_wqe_data_seg *
 603mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
 604                           struct txq *txq,
 605                           struct tso_info *tinfo,
 606                           volatile struct mlx4_wqe_ctrl_seg *ctrl)
 607{
 608        volatile struct mlx4_wqe_lso_seg *tseg =
 609                (volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
 610        struct mlx4_sq *sq = &txq->msq;
 611        struct pv *pv = tinfo->pv;
 612        int *pv_counter = &tinfo->pv_counter;
 613        int remain_size = tinfo->tso_header_size;
 614        char *from = rte_pktmbuf_mtod(buf, char *);
 615        uint16_t txbb_avail_space;
 616        /* Union to overcome volatile constraints when copying TSO header. */
 617        union {
 618                volatile uint8_t *vto;
 619                uint8_t *to;
 620        } thdr = { .vto = (volatile uint8_t *)tseg->header, };
 621
 622        /*
 623         * The TSO header always starts at offset 20 from the beginning of the
 624         * TXBB (16-byte ctrl seg + 4-byte TSO descriptor). Since each TXBB is
 625         * 64-byte aligned, we can write the first 44 header bytes without
 626         * worrying about TxQ wrapping or overwriting the first TXBB 32-bit word.
 627         */
 628        txbb_avail_space = MLX4_TXBB_SIZE -
 629                           (sizeof(struct mlx4_wqe_ctrl_seg) +
 630                            sizeof(struct mlx4_wqe_lso_seg));
 631        while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
 632                /* Copy to end of txbb. */
 633                rte_memcpy(thdr.to, from, txbb_avail_space);
 634                from += txbb_avail_space;
 635                thdr.to += txbb_avail_space;
 636                /* New TXBB, Check for TxQ wrap. */
 637                if (thdr.to >= sq->eob)
 638                        thdr.vto = sq->buf;
 639                /* New TXBB, stash the first 32bits for later use. */
 640                pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
 641                pv[(*pv_counter)++].val = *(uint32_t *)from;
 642                from += sizeof(uint32_t);
 643                thdr.to += sizeof(uint32_t);
 644                remain_size -= txbb_avail_space + sizeof(uint32_t);
 645                /* Avail space in new TXBB is TXBB size - 4 */
 646                txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
 647        }
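            /*
             * Copy the header tail. If it crosses into a new TXBB, stash its
             * leading bytes in pv[] so they are written only after the rest of
             * the WQE, like any other TXBB first DWORD.
             */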
 648        if (remain_size > txbb_avail_space) {
 649                rte_memcpy(thdr.to, from, txbb_avail_space);
 650                from += txbb_avail_space;
 651                thdr.to += txbb_avail_space;
 652                remain_size -= txbb_avail_space;
 653                /* New TXBB, Check for TxQ wrap. */
 654                if (thdr.to >= sq->eob)
 655                        thdr.vto = sq->buf;
 656                pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
 657                rte_memcpy(&pv[*pv_counter].val, from, remain_size);
 658                (*pv_counter)++;
 659        } else if (remain_size) {
 660                rte_memcpy(thdr.to, from, remain_size);
 661        }
 662        tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
 663                                              tinfo->tso_header_size);
 664        /* Calculate data segment location */
 665        return (volatile struct mlx4_wqe_data_seg *)
 666                                ((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
 667}
 668
 669/**
 670 * Write data segments and header for a TSO single- or multi-segment packet.
 671 *
 672 * @param buf
 673 *   Pointer to the first packet mbuf.
 674 * @param txq
 675 *   Pointer to Tx queue structure.
 676 * @param ctrl
 677 *   Pointer to the WQE control segment.
 678 *
 679 * @return
 680 *   Pointer to the next WQE control segment on success, NULL otherwise.
 681 */
 682static volatile struct mlx4_wqe_ctrl_seg *
 683mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
 684                  volatile struct mlx4_wqe_ctrl_seg *ctrl)
 685{
 686        volatile struct mlx4_wqe_data_seg *dseg;
 687        volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
 688        struct mlx4_sq *sq = &txq->msq;
 689        struct tso_info tinfo;
 690        struct pv *pv;
 691        int pv_counter;
 692        int ret;
 693
 694        ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
 695        if (unlikely(ret))
 696                goto error;
 697        dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
 698        if (unlikely(dseg == NULL))
 699                goto error;
 700        if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
 701                dseg = (volatile struct mlx4_wqe_data_seg *)
 702                                        ((uintptr_t)dseg - sq->size);
 703        ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
 704        if (unlikely(ctrl_next == NULL))
 705                goto error;
 706        /* Write the first DWORD of each TXBB saved earlier. */
 707        if (likely(tinfo.pv_counter)) {
 708                pv = tinfo.pv;
 709                pv_counter = tinfo.pv_counter;
 710                /* Need a barrier here before writing the first TXBB word. */
 711                rte_io_wmb();
 712                do {
 713                        --pv_counter;
 714                        *pv[pv_counter].dst = pv[pv_counter].val;
 715                } while (pv_counter > 0);
 716        }
 717        ctrl->fence_size = tinfo.fence_size;
 718        sq->remain_size -= tinfo.wqe_size;
 719        return ctrl_next;
 720error:
 721        txq->stats.odropped++;
 722        return NULL;
 723}
 724
 725/**
 726 * Write data segments of multi-segment packet.
 727 *
 728 * @param buf
 729 *   Pointer to the first packet mbuf.
 730 * @param txq
 731 *   Pointer to Tx queue structure.
 732 * @param ctrl
 733 *   Pointer to the WQE control segment.
 734 *
 735 * @return
 736 *   Pointer to the next WQE control segment on success, NULL otherwise.
 737 */
 738static volatile struct mlx4_wqe_ctrl_seg *
 739mlx4_tx_burst_segs(struct rte_mbuf *buf, struct txq *txq,
 740                   volatile struct mlx4_wqe_ctrl_seg *ctrl)
 741{
 742        struct pv *pv = (struct pv *)txq->bounce_buf;
 743        struct mlx4_sq *sq = &txq->msq;
 744        struct rte_mbuf *sbuf = buf;
 745        uint32_t lkey;
 746        int pv_counter = 0;
 747        int nb_segs = buf->nb_segs;
 748        uint32_t wqe_size;
 749        volatile struct mlx4_wqe_data_seg *dseg =
 750                (volatile struct mlx4_wqe_data_seg *)(ctrl + 1);
 751
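            /* fence_size = WQE size in 16-byte units: 1 ctrl seg + 1 dseg per mbuf segment. */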
 752        ctrl->fence_size = 1 + nb_segs;
 753        wqe_size = RTE_ALIGN((uint32_t)(ctrl->fence_size << MLX4_SEG_SHIFT),
 754                             MLX4_TXBB_SIZE);
 755        /* Validate WQE size and WQE space in the send queue. */
 756        if (sq->remain_size < wqe_size ||
 757            wqe_size > MLX4_MAX_WQE_SIZE)
 758                return NULL;
 759        /*
 760         * Fill the data segments with buffer information.
 761         * First WQE TXBB head segment is always control segment,
 762         * so jump to tail TXBB data segments code for the first
 763         * WQE data segments filling.
 764         */
 765        goto txbb_tail_segs;
 766txbb_head_seg:
 767        /* Memory region key (big endian) for this memory pool. */
 768        lkey = mlx4_tx_mb2mr(txq, sbuf);
 769        if (unlikely(lkey == (uint32_t)-1)) {
 770                DEBUG("%p: unable to get MP <-> MR association",
 771                      (void *)txq);
 772                return NULL;
 773        }
 774        /* Handle WQE wraparound. */
 775        if (dseg >=
 776                (volatile struct mlx4_wqe_data_seg *)sq->eob)
 777                dseg = (volatile struct mlx4_wqe_data_seg *)
 778                        sq->buf;
 779        dseg->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(sbuf, uintptr_t));
 780        dseg->lkey = lkey;
 781        /*
 782         * This data segment starts at the beginning of a new
 783         * TXBB, so we need to postpone its byte_count writing
 784         * for later.
 785         */
 786        pv[pv_counter].dseg = dseg;
 787        /*
 788         * Zero length segment is treated as inline segment
 789         * with zero data.
 790         */
 791        pv[pv_counter++].val = rte_cpu_to_be_32(sbuf->data_len ?
 792                                                sbuf->data_len : 0x80000000);
 793        sbuf = sbuf->next;
 794        dseg++;
 795        nb_segs--;
 796txbb_tail_segs:
 797        /* Jump to default if there are more than two segments remaining. */
 798        switch (nb_segs) {
 799        default:
 800                lkey = mlx4_tx_mb2mr(txq, sbuf);
 801                if (unlikely(lkey == (uint32_t)-1)) {
 802                        DEBUG("%p: unable to get MP <-> MR association",
 803                              (void *)txq);
 804                        return NULL;
 805                }
 806                mlx4_fill_tx_data_seg(dseg, lkey,
 807                                      rte_pktmbuf_mtod(sbuf, uintptr_t),
 808                                      rte_cpu_to_be_32(sbuf->data_len ?
 809                                                       sbuf->data_len :
 810                                                       0x80000000));
 811                sbuf = sbuf->next;
 812                dseg++;
 813                nb_segs--;
 814                /* fallthrough */
 815        case 2:
 816                lkey = mlx4_tx_mb2mr(txq, sbuf);
 817                if (unlikely(lkey == (uint32_t)-1)) {
 818                        DEBUG("%p: unable to get MP <-> MR association",
 819                              (void *)txq);
 820                        return NULL;
 821                }
 822                mlx4_fill_tx_data_seg(dseg, lkey,
 823                                      rte_pktmbuf_mtod(sbuf, uintptr_t),
 824                                      rte_cpu_to_be_32(sbuf->data_len ?
 825                                                       sbuf->data_len :
 826                                                       0x80000000));
 827                sbuf = sbuf->next;
 828                dseg++;
 829                nb_segs--;
 830                /* fallthrough */
 831        case 1:
 832                lkey = mlx4_tx_mb2mr(txq, sbuf);
 833                if (unlikely(lkey == (uint32_t)-1)) {
 834                        DEBUG("%p: unable to get MP <-> MR association",
 835                              (void *)txq);
 836                        return NULL;
 837                }
 838                mlx4_fill_tx_data_seg(dseg, lkey,
 839                                      rte_pktmbuf_mtod(sbuf, uintptr_t),
 840                                      rte_cpu_to_be_32(sbuf->data_len ?
 841                                                       sbuf->data_len :
 842                                                       0x80000000));
 843                nb_segs--;
 844                if (nb_segs) {
 845                        sbuf = sbuf->next;
 846                        dseg++;
 847                        goto txbb_head_seg;
 848                }
 849                /* fallthrough */
 850        case 0:
 851                break;
 852        }
 853        /* Write the first DWORD of each TXBB saved earlier. */
 854        if (pv_counter) {
 855                /* Need a barrier here before writing the byte_count. */
 856                rte_io_wmb();
 857                for (--pv_counter; pv_counter >= 0; pv_counter--)
 858                        pv[pv_counter].dseg->byte_count = pv[pv_counter].val;
 859        }
 860        sq->remain_size -= wqe_size;
 861        /* Align next WQE address to the next TXBB. */
 862        return (volatile struct mlx4_wqe_ctrl_seg *)
 863                ((volatile uint8_t *)ctrl + wqe_size);
 864}
 865
 866/**
 867 * DPDK callback for Tx.
 868 *
 869 * @param dpdk_txq
 870 *   Generic pointer to Tx queue structure.
 871 * @param[in] pkts
 872 *   Packets to transmit.
 873 * @param pkts_n
 874 *   Number of packets in array.
 875 *
 876 * @return
 877 *   Number of packets successfully transmitted (<= pkts_n).
 878 */
 879uint16_t
 880mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 881{
 882        struct txq *txq = (struct txq *)dpdk_txq;
 883        unsigned int elts_head = txq->elts_head;
 884        const unsigned int elts_n = txq->elts_n;
 885        const unsigned int elts_m = elts_n - 1;
 886        unsigned int bytes_sent = 0;
 887        unsigned int i;
 888        unsigned int max = elts_head - txq->elts_tail;
 889        struct mlx4_sq *sq = &txq->msq;
 890        volatile struct mlx4_wqe_ctrl_seg *ctrl;
 891        struct txq_elt *elt;
 892
 893        MLX4_ASSERT(txq->elts_comp_cd != 0);
 894        if (likely(max >= txq->elts_comp_cd_init))
 895                mlx4_txq_complete(txq, elts_m, sq);
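            /* Turn the used-entries count into the number of free entries. */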
 896        max = elts_n - max;
 897        MLX4_ASSERT(max >= 1);
 898        MLX4_ASSERT(max <= elts_n);
 899        /* Always leave one free entry in the ring. */
 900        --max;
 901        if (max > pkts_n)
 902                max = pkts_n;
 903        elt = &(*txq->elts)[elts_head & elts_m];
 904        /* First Tx burst element saves the next WQE control segment. */
 905        ctrl = elt->wqe;
 906        for (i = 0; (i != max); ++i) {
 907                struct rte_mbuf *buf = pkts[i];
 908                struct txq_elt *elt_next = &(*txq->elts)[++elts_head & elts_m];
 909                uint32_t owner_opcode = sq->owner_opcode;
 910                volatile struct mlx4_wqe_data_seg *dseg =
 911                                (volatile struct mlx4_wqe_data_seg *)(ctrl + 1);
 912                volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
 913                union {
 914                        uint32_t flags;
 915                        uint16_t flags16[2];
 916                } srcrb;
 917                uint32_t lkey;
 918                bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 919
 920                /* Clean up old buffer. */
 921                if (likely(elt->buf != NULL)) {
 922                        struct rte_mbuf *tmp = elt->buf;
 923
 924                        /* Faster than rte_pktmbuf_free(). */
 925                        do {
 926                                struct rte_mbuf *next = tmp->next;
 927
 928                                rte_pktmbuf_free_seg(tmp);
 929                                tmp = next;
 930                        } while (tmp != NULL);
 931                }
 932                RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
 933                if (tso) {
 934                        /* Change opcode to TSO */
 935                        owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
 936                        owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
 937                        ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
 938                        if (!ctrl_next) {
 939                                elt->buf = NULL;
 940                                break;
 941                        }
 942                } else if (buf->nb_segs == 1) {
 943                        /* Validate WQE space in the send queue. */
 944                        if (sq->remain_size < MLX4_TXBB_SIZE) {
 945                                elt->buf = NULL;
 946                                break;
 947                        }
 948                        lkey = mlx4_tx_mb2mr(txq, buf);
 949                        if (unlikely(lkey == (uint32_t)-1)) {
 950                                /* MR does not exist. */
 951                                DEBUG("%p: unable to get MP <-> MR association",
 952                                      (void *)txq);
 953                                elt->buf = NULL;
 954                                break;
 955                        }
 956                        mlx4_fill_tx_data_seg(dseg++, lkey,
 957                                              rte_pktmbuf_mtod(buf, uintptr_t),
 958                                              rte_cpu_to_be_32(buf->data_len));
 959                        /* Set WQE size in 16-byte units. */
 960                        ctrl->fence_size = 0x2;
 961                        sq->remain_size -= MLX4_TXBB_SIZE;
 962                        /* Align next WQE address to the next TXBB. */
 963                        ctrl_next = ctrl + 0x4;
 964                } else {
 965                        ctrl_next = mlx4_tx_burst_segs(buf, txq, ctrl);
 966                        if (!ctrl_next) {
 967                                elt->buf = NULL;
 968                                break;
 969                        }
 970                }
 971                /* Hold SQ ring wrap around. */
 972                if ((volatile uint8_t *)ctrl_next >= sq->eob) {
 973                        ctrl_next = (volatile struct mlx4_wqe_ctrl_seg *)
 974                                ((volatile uint8_t *)ctrl_next - sq->size);
 975                        /* Flip HW valid ownership. */
 976                        sq->owner_opcode ^= 1u << MLX4_SQ_OWNER_BIT;
 977                }
 978                /*
 979                 * For raw Ethernet, the SOLICIT flag is used to indicate
 980                 * that no ICRC should be calculated.
 981                 */
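                    /*
                     * Request a CQE (CQ_UPDATE) only once every
                     * elts_comp_cd_init packets to limit completion overhead.
                     */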
 982                if (--txq->elts_comp_cd == 0) {
 983                        /* Save the completion burst end address. */
 984                        elt_next->eocb = (volatile uint32_t *)ctrl_next;
 985                        txq->elts_comp_cd = txq->elts_comp_cd_init;
 986                        srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT |
 987                                               MLX4_WQE_CTRL_CQ_UPDATE);
 988                } else {
 989                        srcrb.flags = RTE_BE32(MLX4_WQE_CTRL_SOLICIT);
 990                }
 991                /* Enable HW checksum offload if requested */
 992                if (txq->csum &&
 993                    (buf->ol_flags &
 994                     (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))) {
 995                        const uint64_t is_tunneled = (buf->ol_flags &
 996                                                      (PKT_TX_TUNNEL_GRE |
 997                                                       PKT_TX_TUNNEL_VXLAN));
 998
 999                        if (is_tunneled && txq->csum_l2tun) {
1000                                owner_opcode |= MLX4_WQE_CTRL_IIP_HDR_CSUM |
1001                                                MLX4_WQE_CTRL_IL4_HDR_CSUM;
1002                                if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
1003                                        srcrb.flags |=
1004                                            RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM);
1005                        } else {
1006                                srcrb.flags |=
1007                                        RTE_BE32(MLX4_WQE_CTRL_IP_HDR_CSUM |
1008                                                MLX4_WQE_CTRL_TCP_UDP_CSUM);
1009                        }
1010                }
1011                if (txq->lb) {
1012                        /*
1013                         * Copy destination MAC address to the WQE, this allows
1014                         * loopback in eSwitch, so that VFs and PF can
1015                         * communicate with each other.
1016                         */
1017                        srcrb.flags16[0] = *(rte_pktmbuf_mtod(buf, uint16_t *));
1018                        ctrl->imm = *(rte_pktmbuf_mtod_offset(buf, uint32_t *,
1019                                              sizeof(uint16_t)));
1020                } else {
1021                        ctrl->imm = 0;
1022                }
1023                ctrl->srcrb_flags = srcrb.flags;
1024                /*
1025                 * Make sure descriptor is fully written before
1026                 * setting ownership bit (because HW can start
1027                 * executing as soon as we do).
1028                 */
1029                rte_io_wmb();
1030                ctrl->owner_opcode = rte_cpu_to_be_32(owner_opcode);
1031                elt->buf = buf;
1032                bytes_sent += buf->pkt_len;
1033                ctrl = ctrl_next;
1034                elt = elt_next;
1035        }
1036        /* Take a shortcut if nothing must be sent. */
1037        if (unlikely(i == 0))
1038                return 0;
1039        /* Save WQE address of the next Tx burst element. */
1040        elt->wqe = ctrl;
1041        /* Increment send statistics counters. */
1042        txq->stats.opackets += i;
1043        txq->stats.obytes += bytes_sent;
1044        /* Make sure that descriptors are written before doorbell record. */
1045        rte_wmb();
1046        /* Ring QP doorbell. */
1047        rte_write32(txq->msq.doorbell_qpn, MLX4_TX_BFREG(txq));
1048        txq->elts_head += i;
1049        return i;
1050}
1051
1052/**
1053 * Translate Rx completion flags to packet type.
1054 *
1055 * @param[in] cqe
1056 *   Pointer to CQE.
1057 *
1058 * @return
1059 *   Packet type for struct rte_mbuf.
1060 */
1061static inline uint32_t
1062rxq_cq_to_pkt_type(volatile struct mlx4_cqe *cqe,
1063                   uint32_t l2tun_offload)
1064{
1065        uint8_t idx = 0;
1066        uint32_t pinfo = rte_be_to_cpu_32(cqe->vlan_my_qpn);
1067        uint32_t status = rte_be_to_cpu_32(cqe->status);
1068
1069        /*
1070         * The index to the array should have:
1071         *  bit[7] - MLX4_CQE_L2_TUNNEL
1072         *  bit[6] - MLX4_CQE_L2_TUNNEL_IPV4
1073         */
1074        if (l2tun_offload && (pinfo & MLX4_CQE_L2_TUNNEL))
1075                idx |= ((pinfo & MLX4_CQE_L2_TUNNEL) >> 20) |
1076                       ((pinfo & MLX4_CQE_L2_TUNNEL_IPV4) >> 19);
1077        /*
1078         * The index to the array should have:
1079         *  bit[5] - MLX4_CQE_STATUS_UDP
1080         *  bit[4] - MLX4_CQE_STATUS_TCP
1081         *  bit[3] - MLX4_CQE_STATUS_IPV4OPT
1082         *  bit[2] - MLX4_CQE_STATUS_IPV6
1083         *  bit[1] - MLX4_CQE_STATUS_IPF
1084         *  bit[0] - MLX4_CQE_STATUS_IPV4
1085         * giving a total of up to 256 entries.
1086         */
1087        idx |= ((status & MLX4_CQE_STATUS_PTYPE_MASK) >> 22);
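            /* For IPv6 packets, fold the IPv6 fragment bit into index bit[1] (IPF). */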
1088        if (status & MLX4_CQE_STATUS_IPV6)
1089                idx |= ((status & MLX4_CQE_STATUS_IPV6F) >> 11);
1090        return mlx4_ptype_table[idx];
1091}
1092
1093/**
1094 * Translate Rx completion flags to offload flags.
1095 *
1096 * @param flags
1097 *   Rx completion flags returned by mlx4_cqe_flags().
1098 * @param csum
1099 *   Whether Rx checksums are enabled.
1100 * @param csum_l2tun
1101 *   Whether Rx L2 tunnel checksums are enabled.
1102 *
1103 * @return
1104 *   Offload flags (ol_flags) in mbuf format.
1105 */
1106static inline uint32_t
1107rxq_cq_to_ol_flags(uint32_t flags, int csum, int csum_l2tun)
1108{
1109        uint32_t ol_flags = 0;
1110
1111        if (csum)
1112                ol_flags |=
1113                        mlx4_transpose(flags,
1114                                       MLX4_CQE_STATUS_IP_HDR_CSUM_OK,
1115                                       PKT_RX_IP_CKSUM_GOOD) |
1116                        mlx4_transpose(flags,
1117                                       MLX4_CQE_STATUS_TCP_UDP_CSUM_OK,
1118                                       PKT_RX_L4_CKSUM_GOOD);
1119        if ((flags & MLX4_CQE_L2_TUNNEL) && csum_l2tun)
1120                ol_flags |=
1121                        mlx4_transpose(flags,
1122                                       MLX4_CQE_L2_TUNNEL_IPOK,
1123                                       PKT_RX_IP_CKSUM_GOOD) |
1124                        mlx4_transpose(flags,
1125                                       MLX4_CQE_L2_TUNNEL_L4_CSUM,
1126                                       PKT_RX_L4_CKSUM_GOOD);
1127        return ol_flags;
1128}
1129
1130/**
1131 * Extract checksum information from CQE flags.
1132 *
1133 * @param cqe
1134 *   Pointer to CQE structure.
1135 * @param csum
1136 *   Whether Rx checksums are enabled.
1137 * @param csum_l2tun
1138 *   Whether Rx L2 tunnel checksums are enabled.
1139 *
1140 * @return
1141 *   CQE checksum information.
1142 */
1143static inline uint32_t
1144mlx4_cqe_flags(volatile struct mlx4_cqe *cqe, int csum, int csum_l2tun)
1145{
1146        uint32_t flags = 0;
1147
1148        /*
1149         * The relevant bits are in different locations of their
1150         * respective CQE fields, therefore they can be joined in one
1151         * 32-bit variable.
1152         */
1153        if (csum)
1154                flags = (rte_be_to_cpu_32(cqe->status) &
1155                         MLX4_CQE_STATUS_IPV4_CSUM_OK);
1156        if (csum_l2tun)
1157                flags |= (rte_be_to_cpu_32(cqe->vlan_my_qpn) &
1158                          (MLX4_CQE_L2_TUNNEL |
1159                           MLX4_CQE_L2_TUNNEL_IPOK |
1160                           MLX4_CQE_L2_TUNNEL_L4_CSUM |
1161                           MLX4_CQE_L2_TUNNEL_IPV4));
1162        return flags;
1163}
1164
1165/**
1166 * Poll one CQE from CQ.
1167 *
1168 * @param rxq
1169 *   Pointer to the receive queue structure.
1170 * @param[out] out
1171 *   Just polled CQE.
1172 *
1173 * @return
1174 *   Number of bytes of the CQE, 0 in case there is no completion.
1175 */
1176static unsigned int
1177mlx4_cq_poll_one(struct rxq *rxq, volatile struct mlx4_cqe **out)
1178{
1179        int ret = 0;
1180        volatile struct mlx4_cqe *cqe = NULL;
1181        struct mlx4_cq *cq = &rxq->mcq;
1182
1183        cqe = (volatile struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
1184        if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
1185            !!(cq->cons_index & cq->cqe_cnt))
1186                goto out;
1187        /*
1188         * Make sure we read CQ entry contents after we've checked the
1189         * ownership bit.
1190         */
1191        rte_rmb();
1192        MLX4_ASSERT(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
1193        MLX4_ASSERT((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
1194                    MLX4_CQE_OPCODE_ERROR);
1195        ret = rte_be_to_cpu_32(cqe->byte_cnt);
1196        ++cq->cons_index;
1197out:
1198        *out = cqe;
1199        return ret;
1200}
1201
1202/**
1203 * DPDK callback for Rx with scattered packets support.
1204 *
1205 * @param dpdk_rxq
1206 *   Generic pointer to Rx queue structure.
1207 * @param[out] pkts
1208 *   Array to store received packets.
1209 * @param pkts_n
1210 *   Maximum number of packets in array.
1211 *
1212 * @return
1213 *   Number of packets successfully received (<= pkts_n).
1214 */
1215uint16_t
1216mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1217{
1218        struct rxq *rxq = dpdk_rxq;
1219        const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
1220        const uint16_t sges_n = rxq->sges_n;
1221        struct rte_mbuf *pkt = NULL;
1222        struct rte_mbuf *seg = NULL;
1223        unsigned int i = 0;
1224        uint32_t rq_ci = rxq->rq_ci << sges_n;
1225        int len = 0;
1226
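            /* rq_ci is in single-WQE units; rxq->rq_ci counts strides of (1 << sges_n) WQEs. */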
1227        while (pkts_n) {
1228                volatile struct mlx4_cqe *cqe;
1229                uint32_t idx = rq_ci & wr_cnt;
1230                struct rte_mbuf *rep = (*rxq->elts)[idx];
1231                volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
1232
1233                /* Update the 'next' pointer of the previous segment. */
1234                if (pkt)
1235                        seg->next = rep;
1236                seg = rep;
1237                rte_prefetch0(seg);
1238                rte_prefetch0(scat);
1239                rep = rte_mbuf_raw_alloc(rxq->mp);
1240                if (unlikely(rep == NULL)) {
1241                        ++rxq->stats.rx_nombuf;
1242                        if (!pkt) {
1243                                /*
1244                                 * No buffers before we even started,
1245                                 * bail out silently.
1246                                 */
1247                                break;
1248                        }
1249                        while (pkt != seg) {
1250                                MLX4_ASSERT(pkt != (*rxq->elts)[idx]);
1251                                rep = pkt->next;
1252                                pkt->next = NULL;
1253                                pkt->nb_segs = 1;
1254                                rte_mbuf_raw_free(pkt);
1255                                pkt = rep;
1256                        }
1257                        break;
1258                }
1259                if (!pkt) {
1260                        /* Looking for the new packet. */
1261                        len = mlx4_cq_poll_one(rxq, &cqe);
1262                        if (!len) {
1263                                rte_mbuf_raw_free(rep);
1264                                break;
1265                        }
1266                        if (unlikely(len < 0)) {
1267                                /* Rx error, packet is likely too large. */
1268                                rte_mbuf_raw_free(rep);
1269                                ++rxq->stats.idropped;
1270                                goto skip;
1271                        }
1272                        pkt = seg;
1273                        MLX4_ASSERT(len >= (rxq->crc_present << 2));
1274                        /* Update packet information. */
1275                        pkt->packet_type =
1276                                rxq_cq_to_pkt_type(cqe, rxq->l2tun_offload);
1277                        pkt->ol_flags = PKT_RX_RSS_HASH;
1278                        pkt->hash.rss = cqe->immed_rss_invalid;
1279                        if (rxq->crc_present)
1280                                len -= RTE_ETHER_CRC_LEN;
1281                        pkt->pkt_len = len;
1282                        if (rxq->csum | rxq->csum_l2tun) {
1283                                uint32_t flags =
1284                                        mlx4_cqe_flags(cqe,
1285                                                       rxq->csum,
1286                                                       rxq->csum_l2tun);
1287
1288                                pkt->ol_flags =
1289                                        rxq_cq_to_ol_flags(flags,
1290                                                           rxq->csum,
1291                                                           rxq->csum_l2tun);
1292                        }
1293                }
1294                rep->nb_segs = 1;
1295                rep->port = rxq->port_id;
1296                rep->data_len = seg->data_len;
1297                rep->data_off = seg->data_off;
1298                (*rxq->elts)[idx] = rep;
1299                /*
1300                 * Fill NIC descriptor with the new buffer. The lkey and size
1301                 * of the buffers are already known, only the buffer address
1302                 * changes.
1303                 */
1304                scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
1305                /* If there's only one MR, no need to replace LKey in WQE. */
1306                if (unlikely(mlx4_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
1307                        scat->lkey = mlx4_rx_mb2mr(rxq, rep);
1308                if (len > seg->data_len) {
1309                        len -= seg->data_len;
1310                        ++pkt->nb_segs;
1311                        ++rq_ci;
1312                        continue;
1313                }
1314                /* The last segment. */
1315                seg->data_len = len;
1316                /* Increment bytes counter. */
1317                rxq->stats.ibytes += pkt->pkt_len;
1318                /* Return packet. */
1319                *(pkts++) = pkt;
1320                pkt = NULL;
1321                --pkts_n;
1322                ++i;
1323skip:
1324                /* Align consumer index to the next stride. */
1325                rq_ci >>= sges_n;
1326                ++rq_ci;
1327                rq_ci <<= sges_n;
1328        }
1329        if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
1330                return 0;
1331        /* Update the consumer index. */
1332        rxq->rq_ci = rq_ci >> sges_n;
1333        rte_wmb();
1334        *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1335        *rxq->mcq.set_ci_db =
1336                rte_cpu_to_be_32(rxq->mcq.cons_index & MLX4_CQ_DB_CI_MASK);
1337        /* Increment packets counter. */
1338        rxq->stats.ipackets += i;
1339        return i;
1340}
1341
1342/**
1343 * Dummy DPDK callback for Tx.
1344 *
1345 * This function is used to temporarily replace the real callback during
1346 * unsafe control operations on the queue, or in case of error.
1347 *
1348 * @param dpdk_txq
1349 *   Generic pointer to Tx queue structure.
1350 * @param[in] pkts
1351 *   Packets to transmit.
1352 * @param pkts_n
1353 *   Number of packets in array.
1354 *
1355 * @return
1356 *   Number of packets successfully transmitted (<= pkts_n).
1357 */
1358uint16_t
1359mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1360{
1361        (void)dpdk_txq;
1362        (void)pkts;
1363        (void)pkts_n;
1364        rte_mb();
1365        return 0;
1366}
1367
1368/**
1369 * Dummy DPDK callback for Rx.
1370 *
1371 * This function is used to temporarily replace the real callback during
1372 * unsafe control operations on the queue, or in case of error.
1373 *
1374 * @param dpdk_rxq
1375 *   Generic pointer to Rx queue structure.
1376 * @param[out] pkts
1377 *   Array to store received packets.
1378 * @param pkts_n
1379 *   Maximum number of packets in array.
1380 *
1381 * @return
1382 *   Number of packets successfully received (<= pkts_n).
1383 */
1384uint16_t
1385mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1386{
1387        (void)dpdk_rxq;
1388        (void)pkts;
1389        (void)pkts_n;
1390        rte_mb();
1391        return 0;
1392}
1393