dpdk/drivers/net/octeontx2/otx2_tx.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

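/*
 * Flow-control helper: *txq->fc_mem holds the count of SQ buffers currently
 * in use. When the cached packet credit runs low, recompute it as
 * (nb_sqb_bufs_adj - *fc_mem) << sqes_per_sqb_log2 and make the caller
 * return 0 (no packets sent) if there is still no room.
 */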
#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update the fc_cache_pkts */             \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Multiply with sqe_per_sqb to express in pkts */      \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check it again for the room */                       \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)


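/*
 * Scalar transmit: for each packet, build the send command in 'cmd'
 * (send header, optional extended header, SG and timestamp sub-descriptors)
 * and issue one LMTST per packet via otx2_nix_xmit_one().
 */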
static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint16_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint64_t lso_tun_fmt;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                lso_tun_fmt = txq->lso_tun_fmt;
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet here, as no further
         * changes will be made to the packet unless no-fast-free
         * (MBUF_NOFF) is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
                /* Pass number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

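/*
 * Multi-segment transmit: same flow as nix_xmit_pkts(), except the SG list
 * is built per segment by otx2_nix_prepare_mseg(), which returns the number
 * of descriptor words (segdw) used for that packet.
 */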
static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint64_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint64_t lso_tun_fmt;
        uint16_t segdw;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                lso_tun_fmt = txq->lso_tun_fmt;
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet here, as no further
         * changes will be made to the packet unless no-fast-free
         * (MBUF_NOFF) is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

#if defined(RTE_ARCH_ARM64)

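/*
 * Vector (NEON) transmit: processes NIX_DESCS_PER_LOOP (4) packets per
 * iteration. Each packet uses a 4-dword command (2-dword send header +
 * 2-dword SG), so one 16-dword LMT line carries all four packets. This
 * path has no extended header, which is why VLAN/TSTAMP/TSO fall back to
 * the scalar functions.
 */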
#define NIX_DESCS_PER_LOOP      4
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;
        uint16_t pkts_left;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

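        /* Handle multiples of 4 packets in the vector loop below; the
         * remainder (pkts_left) is sent through the scalar path at the end.
         */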
        pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet here, as no further
         * changes will be made to the packet unless no-fast-free
         * (MBUF_NOFF) is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

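        /* Lane-duplicate the queue's template send header word 0 (cmd[0])
         * and SG descriptor word 0 (cmd[2]) so packets 0/1 and 2/3 can be
         * built in parallel.
         */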
        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;

        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                /* Move mbufs to iova */
                mbuf0 = (uint64_t *)tx_pkts[0];
                mbuf1 = (uint64_t *)tx_pkts[1];
                mbuf2 = (uint64_t *)tx_pkts[2];
                mbuf3 = (uint64_t *)tx_pkts[3];

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Get the mbufs' olflags, iova, pktlen, dataoff
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0  = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1  = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2  = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3  = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);

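                /* When fast free is not available, otx2_nix_prefree_seg()
                 * decides per mbuf whether hardware may free it; if not,
                 * the SEND_HDR "don't free" bit (bit 19, 0x80000) is set.
                 */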
                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Ensure the mbuf fields updated in
                         * otx2_nix_prefree_seg() are written before LMTST.
                         */
                        rte_io_wmb();
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }

                /* Move mbuf pointers to the pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths */
                        /*
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */

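                        /* LD1 with a lane selector loads the 8-byte
                         * tx_offload field of each mbuf straight into one
                         * 64-bit lane of senddesc_w1.
                         */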
                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");

                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }

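                /* Byte-shuffle (vqtbl1q_u8) with 0xFF entries zeroes those
                 * output bytes; shuf_mask2 keeps only the low 16 bits of
                 * pkt_len from each mbuf and drops everything else.
                 */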
                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only 16 bits of pktlen present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

                /* OR both sgdesc_w0 and senddesc_w0 with the 16 bits of
                 * pktlen at position 15:0.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);

                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. But we still use ol3/ol4 types in
                         * senddesc_w1 as only one header processing is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* RTE_MBUF_F_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* RTE_MBUF_F_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* RTE_MBUF_F_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* RTE_MBUF_F_TX_IP_CKSUM */
                                0x13, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_TCP_CKSUM */
                                0x23, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_SCTP_CKSUM */
                                0x33, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_UDP_CKSUM */
                                0x02, /* RTE_MBUF_F_TX_IPV4 */
                                0x12, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_TCP_CKSUM */
                                0x22, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_SCTP_CKSUM */
                                0x32, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_UDP_CKSUM */
                                0x03, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM */
                                0x13, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                                       * RTE_MBUF_F_TX_TCP_CKSUM
                                       */
                                0x23, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                                       * RTE_MBUF_F_TX_SCTP_CKSUM
                                       */
                                0x33, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                                       * RTE_MBUF_F_TX_UDP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte and rest
                         * don't care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Point mbufX at the pool's aura (pool_id) for
                         * the LD1 loads below.
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 48:55 of iltype
                         * and place them in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to oltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Point mbufX at the pool's aura (pool_id) for
                         * the LD1 loads below.
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 56:63 of oltype
                         * and place them in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* RTE_MBUF_F_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* RTE_MBUF_F_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* RTE_MBUF_F_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* RTE_MBUF_F_TX_IP_CKSUM */
                                        0x13, /* RTE_MBUF_F_TX_IP_CKSUM |
                                               * RTE_MBUF_F_TX_TCP_CKSUM
                                               */
                                        0x23, /* RTE_MBUF_F_TX_IP_CKSUM |
                                               * RTE_MBUF_F_TX_SCTP_CKSUM
                                               */
                                        0x33, /* RTE_MBUF_F_TX_IP_CKSUM |
                                               * RTE_MBUF_F_TX_UDP_CKSUM
                                               */
                                        0x02, /* RTE_MBUF_F_TX_IPV4 */
                                        0x12, /* RTE_MBUF_F_TX_IPV4 |
                                               * RTE_MBUF_F_TX_TCP_CKSUM
                                               */
                                        0x22, /* RTE_MBUF_F_TX_IPV4 |
                                               * RTE_MBUF_F_TX_SCTP_CKSUM
                                               */
                                        0x32, /* RTE_MBUF_F_TX_IPV4 |
                                               * RTE_MBUF_F_TX_UDP_CKSUM
                                               */
                                        0x03, /* RTE_MBUF_F_TX_IPV4 |
                                               * RTE_MBUF_F_TX_IP_CKSUM
                                               */
                                        0x13, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                                               * RTE_MBUF_F_TX_TCP_CKSUM
                                               */
                                        0x23, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                                               * RTE_MBUF_F_TX_SCTP_CKSUM
                                               */
                                        0x33, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                                               * RTE_MBUF_F_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4
                         * to start oltype nibble from BIT(56)
                         * instead of BIT(58) and iltype nibble from BIT(48)
                         * instead of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Mark Bit(4) of oltype */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Point mbufX at the pool's aura (pool_id) for
                         * the LD1 loads below.
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 48:55 of iltype
                         * and bits 56:63 of oltype and place them in the
                         * corresponding places in senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }

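                /* Write the four 4-dword commands (16 dwords) to the LMT
                 * line and submit with LMTST; otx2_lmt_submit() returns 0
                 * when the transfer did not complete, in which case the
                 * stores are replayed and the submit is retried.
                 */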
                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);

                } while (lmt_status == 0);
                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
        }

        if (unlikely(pkts_left))
                pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(cmd);
        RTE_SET_USED(flags);
        return 0;
}
#endif

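/*
 * The T() macro below, combined with NIX_TX_FASTPATH_MODES, expands one
 * specialized burst function per combination of TX offload flags. 'flags'
 * is a compile-time constant in each expansion, so the unused branches in
 * nix_xmit_pkts() are optimized away per variant.
 */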
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                                       \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* For TSO inner checksum is a must */                          \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                                       \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        /* For TSO inner checksum is a must */                          \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                                       \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* VLAN, TSTMP, TSO are not supported by vec */                 \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
            (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
}

NIX_TX_FASTPATH_MODES
#undef T

static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [SEC] [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_SECURITY_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}

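/*
 * Select the burst function at device start: scalar when scalar_ena is set
 * or when VLAN/TSTAMP/TSO offloads are requested (the vector path does not
 * support them), vector otherwise, and the multi-segment variant whenever
 * RTE_ETH_TX_OFFLOAD_MULTI_SEGS is enabled.
 */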
void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
              NIX_TX_OFFLOAD_TSO_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & RTE_ETH_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}