/* dpdk/drivers/net/ice/ice_rxtx_vec_avx512.c */
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2019 Intel Corporation
   3 */
   4
   5#include "ice_rxtx_vec_common.h"
   6
   7#include <x86intrin.h>
   8
   9#ifndef __INTEL_COMPILER
  10#pragma GCC diagnostic ignored "-Wcast-qual"
  11#endif
  12
  13#define ICE_DESCS_PER_LOOP_AVX 8
  14
  15static inline void
  16ice_rxq_rearm(struct ice_rx_queue *rxq)
  17{
  18        int i;
  19        uint16_t rx_id;
  20        volatile union ice_rx_flex_desc *rxdp;
  21        struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
  22        struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
  23                        rte_lcore_id());
  24
  25        rxdp = rxq->rx_ring + rxq->rxrearm_start;
  26
  27        /* We need to pull 'n' more MBUFs into the software ring */
  28        if (cache->len < ICE_RXQ_REARM_THRESH) {
  29                uint32_t req = ICE_RXQ_REARM_THRESH + (cache->size -
  30                                cache->len);
  31
  32                int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
  33                                &cache->objs[cache->len], req);
  34                if (ret == 0) {
  35                        cache->len += req;
  36                } else {
  37                        if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
  38                            rxq->nb_rx_desc) {
  39                                __m128i dma_addr0;
  40
  41                                dma_addr0 = _mm_setzero_si128();
  42                                for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
  43                                        rxep[i].mbuf = &rxq->fake_mbuf;
  44                                        _mm_store_si128
  45                                                ((__m128i *)&rxdp[i].read,
  46                                                        dma_addr0);
  47                                }
  48                        }
  49                        rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
  50                                ICE_RXQ_REARM_THRESH;
  51                        return;
  52                }
  53        }
  54
  55        const __m512i iova_offsets =  _mm512_set1_epi64
  56                (offsetof(struct rte_mbuf, buf_iova));
  57        const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
  58
  59#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
  60        /* shuffle the iova into correct slots. Values 4-7 will contain
  61         * zeros, so use 7 for a zero-value.
  62         */
  63        const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
  64#else
  65        const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
  66#endif
  67
  68        /* fill up the rxd in vector, process 8 mbufs in one loop */
  69        for (i = 0; i < ICE_RXQ_REARM_THRESH / 8; i++) {
  70                const __m512i mbuf_ptrs = _mm512_loadu_si512
  71                        (&cache->objs[cache->len - 8]);
  72                _mm512_store_si512(rxep, mbuf_ptrs);
  73
  74                /* gather iova of mbuf0-7 into one zmm reg */
  75                const __m512i iova_base_addrs = _mm512_i64gather_epi64
  76                        (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
  77                                0, /* base */
  78                                1  /* scale */);
  79                const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
  80                                headroom);
  81#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
  82                const __m512i iovas0 = _mm512_castsi256_si512
  83                        (_mm512_extracti64x4_epi64(iova_addrs, 0));
  84                const __m512i iovas1 = _mm512_castsi256_si512
  85                        (_mm512_extracti64x4_epi64(iova_addrs, 1));
  86
  87                /* permute leaves iova 2-3 in hdr_addr of desc 0-1
  88                 * but these are ignored by driver since header split not
  89                 * enabled. Similarly for desc 4 & 5.
  90                 */
  91                const __m512i desc0_1 = _mm512_permutexvar_epi64
  92                        (permute_idx, iovas0);
  93                const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
  94
  95                const __m512i desc4_5 = _mm512_permutexvar_epi64
  96                        (permute_idx, iovas1);
  97                const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);
  98
  99                _mm512_store_si512((void *)rxdp, desc0_1);
 100                _mm512_store_si512((void *)(rxdp + 2), desc2_3);
 101                _mm512_store_si512((void *)(rxdp + 4), desc4_5);
 102                _mm512_store_si512((void *)(rxdp + 6), desc6_7);
 103#else
 104                /* permute leaves iova 4-7 in hdr_addr of desc 0-3
 105                 * but these are ignored by driver since header split not
 106                 * enabled.
 107                 */
 108                const __m512i desc0_3 = _mm512_permutexvar_epi64
 109                        (permute_idx, iova_addrs);
 110                const __m512i desc4_7 = _mm512_bsrli_epi128(desc0_3, 8);
 111
 112                _mm512_store_si512((void *)rxdp, desc0_3);
 113                _mm512_store_si512((void *)(rxdp + 4), desc4_7);
 114#endif
 115                rxep += 8, rxdp += 8, cache->len -= 8;
 116        }
 117
 118        rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
 119        if (rxq->rxrearm_start >= rxq->nb_rx_desc)
 120                rxq->rxrearm_start = 0;
 121
 122        rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;
 123
 124        rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
 125                             (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));
 126
 127        /* Update the tail pointer on the NIC */
 128        ICE_PCI_REG_WRITE(rxq->qrx_tail, rx_id);
 129}
 130
 131static inline __m256i
 132ice_flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
 133{
 134#define FDID_MIS_MAGIC 0xFFFFFFFF
 135        RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
 136        RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
 137        const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
 138                        PKT_RX_FDIR_ID);
 139        /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
 140        const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
 141        __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
 142                        fdir_mis_mask);
 143        /* this XOR op results to bit-reverse the fdir_mask */
 144        fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
 145        const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
 146
 147        return fdir_flags;
 148}
 149
 150static inline uint16_t
 151_ice_recv_raw_pkts_vec_avx512(struct ice_rx_queue *rxq,
 152                              struct rte_mbuf **rx_pkts,
 153                              uint16_t nb_pkts, uint8_t *split_packet)
 154{
 155        const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
 156        const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
 157                        0, rxq->mbuf_initializer);
 158        struct ice_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
 159        volatile union ice_rx_flex_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
 160
 161        rte_prefetch0(rxdp);
 162
 163        /* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP_AVX */
 164        nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP_AVX);
 165
 166        /* See if we need to rearm the RX queue - gives the prefetch a bit
 167         * of time to act
 168         */
 169        if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
 170                ice_rxq_rearm(rxq);
 171
 172        /* Before we start moving massive data around, check to see if
 173         * there is actually a packet available
 174         */
 175        if (!(rxdp->wb.status_error0 &
 176                        rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
 177                return 0;
 178
 179        /* constants used in processing loop */
 180        const __m512i crc_adjust =
 181                _mm512_set4_epi32
 182                        (0,             /* ignore non-length fields */
 183                         -rxq->crc_len, /* sub crc on data_len */
 184                         -rxq->crc_len, /* sub crc on pkt_len */
 185                         0              /* ignore non-length fields */
 186                        );
 187
 188        /* 8 packets DD mask, LSB in each 32-bit value */
 189        const __m256i dd_check = _mm256_set1_epi32(1);
 190
 191        /* 8 packets EOP mask, second-LSB in each 32-bit value */
 192        const __m256i eop_check = _mm256_slli_epi32(dd_check,
 193                        ICE_RX_DESC_STATUS_EOF_S);
 194
 195        /* mask to shuffle from desc. to mbuf (4 descriptors)*/
 196        const __m512i shuf_msk =
 197                _mm512_set4_epi32
 198                        (/* rss hash parsed separately */
 199                         0xFFFFFFFF,
 200                         /* octet 10~11, 16 bits vlan_macip */
 201                         /* octet 4~5, 16 bits data_len */
 202                         11 << 24 | 10 << 16 | 5 << 8 | 4,
 203                         /* skip hi 16 bits pkt_len, zero out */
 204                         /* octet 4~5, 16 bits pkt_len */
 205                         0xFFFF << 16 | 5 << 8 | 4,
 206                         /* pkt_type set as unknown */
 207                         0xFFFFFFFF
 208                        );
 209
 210        /**
 211         * compile-time check the above crc and shuffle layout is correct.
 212         * NOTE: the first field (lowest address) is given last in set_epi
 213         * calls above.
 214         */
 215        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
 216                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
 217        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
 218                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
 219        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
 220                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
 221        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
 222                        offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);
 223
 224        /* Status/Error flag masks */
 225        /**
 226         * mask everything except Checksum Reports, RSS indication
 227         * and VLAN indication.
 228         * bit6:4 for IP/L4 checksum errors.
 229         * bit12 is for RSS indication.
 230         * bit13 is for VLAN indication.
 231         */
 232        const __m256i flags_mask =
 233                 _mm256_set1_epi32((7 << 4) | (1 << 12) | (1 << 13));
 234        /**
 235         * data to be shuffled by the result of the flags mask shifted by 4
 236         * bits.  This gives use the l3_l4 flags.
 237         */
 238        const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
 239                        /* shift right 1 bit to make sure it not exceed 255 */
 240                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
 241                         PKT_RX_IP_CKSUM_BAD) >> 1,
 242                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
 243                         PKT_RX_IP_CKSUM_GOOD) >> 1,
 244                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
 245                         PKT_RX_IP_CKSUM_BAD) >> 1,
 246                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
 247                         PKT_RX_IP_CKSUM_GOOD) >> 1,
 248                        (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
 249                        (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
 250                        (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
 251                        (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
 252                        /* 2nd 128-bits */
 253                        0, 0, 0, 0, 0, 0, 0, 0,
 254                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
 255                         PKT_RX_IP_CKSUM_BAD) >> 1,
 256                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
 257                         PKT_RX_IP_CKSUM_GOOD) >> 1,
 258                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
 259                         PKT_RX_IP_CKSUM_BAD) >> 1,
 260                        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
 261                         PKT_RX_IP_CKSUM_GOOD) >> 1,
 262                        (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
 263                        (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
 264                        (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
 265                        (PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);
 266        const __m256i cksum_mask =
 267                 _mm256_set1_epi32(PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
 268                                   PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
 269                                   PKT_RX_EIP_CKSUM_BAD);
 270        /**
 271         * data to be shuffled by result of flag mask, shifted down 12.
 272         * If RSS(bit12)/VLAN(bit13) are set,
 273         * shuffle moves appropriate flags in place.
 274         */
 275        const __m256i rss_vlan_flags_shuf = _mm256_set_epi8(0, 0, 0, 0,
 276                        0, 0, 0, 0,
 277                        0, 0, 0, 0,
 278                        PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 279                        PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 280                        PKT_RX_RSS_HASH, 0,
 281                        /* 2nd 128-bits */
 282                        0, 0, 0, 0,
 283                        0, 0, 0, 0,
 284                        0, 0, 0, 0,
 285                        PKT_RX_RSS_HASH | PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 286                        PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED,
 287                        PKT_RX_RSS_HASH, 0);
 288
 289        uint16_t i, received;
 290
 291        for (i = 0, received = 0; i < nb_pkts;
 292             i += ICE_DESCS_PER_LOOP_AVX,
 293             rxdp += ICE_DESCS_PER_LOOP_AVX) {
 294                /* step 1, copy over 8 mbuf pointers to rx_pkts array */
 295                _mm256_storeu_si256((void *)&rx_pkts[i],
 296                                    _mm256_loadu_si256((void *)&sw_ring[i]));
 297#ifdef RTE_ARCH_X86_64
 298                _mm256_storeu_si256
 299                        ((void *)&rx_pkts[i + 4],
 300                         _mm256_loadu_si256((void *)&sw_ring[i + 4]));
 301#endif
 302
 303                __m512i raw_desc0_3, raw_desc4_7;
 304                __m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
 305
 306                /* load in descriptors, in reverse order */
 307                const __m128i raw_desc7 =
 308                        _mm_load_si128((void *)(rxdp + 7));
 309                rte_compiler_barrier();
 310                const __m128i raw_desc6 =
 311                        _mm_load_si128((void *)(rxdp + 6));
 312                rte_compiler_barrier();
 313                const __m128i raw_desc5 =
 314                        _mm_load_si128((void *)(rxdp + 5));
 315                rte_compiler_barrier();
 316                const __m128i raw_desc4 =
 317                        _mm_load_si128((void *)(rxdp + 4));
 318                rte_compiler_barrier();
 319                const __m128i raw_desc3 =
 320                        _mm_load_si128((void *)(rxdp + 3));
 321                rte_compiler_barrier();
 322                const __m128i raw_desc2 =
 323                        _mm_load_si128((void *)(rxdp + 2));
 324                rte_compiler_barrier();
 325                const __m128i raw_desc1 =
 326                        _mm_load_si128((void *)(rxdp + 1));
 327                rte_compiler_barrier();
 328                const __m128i raw_desc0 =
 329                        _mm_load_si128((void *)(rxdp + 0));
 330
 331                raw_desc6_7 =
 332                        _mm256_inserti128_si256
 333                                (_mm256_castsi128_si256(raw_desc6),
 334                                 raw_desc7, 1);
 335                raw_desc4_5 =
 336                        _mm256_inserti128_si256
 337                                (_mm256_castsi128_si256(raw_desc4),
 338                                 raw_desc5, 1);
 339                raw_desc2_3 =
 340                        _mm256_inserti128_si256
 341                                (_mm256_castsi128_si256(raw_desc2),
 342                                 raw_desc3, 1);
 343                raw_desc0_1 =
 344                        _mm256_inserti128_si256
 345                                (_mm256_castsi128_si256(raw_desc0),
 346                                 raw_desc1, 1);
 347
 348                raw_desc4_7 =
 349                        _mm512_inserti64x4
 350                                (_mm512_castsi256_si512(raw_desc4_5),
 351                                 raw_desc6_7, 1);
 352                raw_desc0_3 =
 353                        _mm512_inserti64x4
 354                                (_mm512_castsi256_si512(raw_desc0_1),
 355                                 raw_desc2_3, 1);
 356
 357                if (split_packet) {
 358                        int j;
 359
 360                        for (j = 0; j < ICE_DESCS_PER_LOOP_AVX; j++)
 361                                rte_mbuf_prefetch_part2(rx_pkts[i + j]);
 362                }
 363
 364                /**
 365                 * convert descriptors 0-7 into mbufs, re-arrange fields.
 366                 * Then write into the mbuf.
 367                 */
 368                __m512i mb4_7 = _mm512_shuffle_epi8(raw_desc4_7, shuf_msk);
 369                __m512i mb0_3 = _mm512_shuffle_epi8(raw_desc0_3, shuf_msk);
 370
 371                mb4_7 = _mm512_add_epi32(mb4_7, crc_adjust);
 372                mb0_3 = _mm512_add_epi32(mb0_3, crc_adjust);
 373
 374                /**
 375                 * to get packet types, ptype is located in bit16-25
 376                 * of each 128bits
 377                 */
 378                const __m512i ptype_mask =
 379                        _mm512_set1_epi16(ICE_RX_FLEX_DESC_PTYPE_M);
 380
 381                /**
 382                 * to get packet types, ptype is located in bit16-25
 383                 * of each 128bits
 384                 */
 385                const __m512i ptypes4_7 =
 386                        _mm512_and_si512(raw_desc4_7, ptype_mask);
 387                const __m512i ptypes0_3 =
 388                        _mm512_and_si512(raw_desc0_3, ptype_mask);
 389
 390                const __m256i ptypes6_7 =
 391                        _mm512_extracti64x4_epi64(ptypes4_7, 1);
 392                const __m256i ptypes4_5 =
 393                        _mm512_extracti64x4_epi64(ptypes4_7, 0);
 394                const __m256i ptypes2_3 =
 395                        _mm512_extracti64x4_epi64(ptypes0_3, 1);
 396                const __m256i ptypes0_1 =
 397                        _mm512_extracti64x4_epi64(ptypes0_3, 0);
 398                const uint16_t ptype7 = _mm256_extract_epi16(ptypes6_7, 9);
 399                const uint16_t ptype6 = _mm256_extract_epi16(ptypes6_7, 1);
 400                const uint16_t ptype5 = _mm256_extract_epi16(ptypes4_5, 9);
 401                const uint16_t ptype4 = _mm256_extract_epi16(ptypes4_5, 1);
 402                const uint16_t ptype3 = _mm256_extract_epi16(ptypes2_3, 9);
 403                const uint16_t ptype2 = _mm256_extract_epi16(ptypes2_3, 1);
 404                const uint16_t ptype1 = _mm256_extract_epi16(ptypes0_1, 9);
 405                const uint16_t ptype0 = _mm256_extract_epi16(ptypes0_1, 1);
 406
 407                const __m512i ptype4_7 = _mm512_set_epi32
 408                        (0, 0, 0, ptype_tbl[ptype7],
 409                         0, 0, 0, ptype_tbl[ptype6],
 410                         0, 0, 0, ptype_tbl[ptype5],
 411                         0, 0, 0, ptype_tbl[ptype4]);
 412                const __m512i ptype0_3 = _mm512_set_epi32
 413                        (0, 0, 0, ptype_tbl[ptype3],
 414                         0, 0, 0, ptype_tbl[ptype2],
 415                         0, 0, 0, ptype_tbl[ptype1],
 416                         0, 0, 0, ptype_tbl[ptype0]);
 417
 418                mb4_7 = _mm512_mask_blend_epi32(0x1111, mb4_7, ptype4_7);
 419                mb0_3 = _mm512_mask_blend_epi32(0x1111, mb0_3, ptype0_3);
 420
 421                __m256i mb4_5 = _mm512_extracti64x4_epi64(mb4_7, 0);
 422                __m256i mb6_7 = _mm512_extracti64x4_epi64(mb4_7, 1);
 423                __m256i mb0_1 = _mm512_extracti64x4_epi64(mb0_3, 0);
 424                __m256i mb2_3 = _mm512_extracti64x4_epi64(mb0_3, 1);
 425
 426                /**
 427                 * use permute/extract to get status content
 428                 * After the operations, the packets status flags are in the
 429                 * order (hi->lo): [1, 3, 5, 7, 0, 2, 4, 6]
 430                 */
 431                /* merge the status bits into one register */
 432                const __m512i status_permute_msk = _mm512_set_epi32
 433                        (0, 0, 0, 0,
 434                         0, 0, 0, 0,
 435                         22, 30, 6, 14,
 436                         18, 26, 2, 10);
 437                const __m512i raw_status0_7 = _mm512_permutex2var_epi32
 438                        (raw_desc4_7, status_permute_msk, raw_desc0_3);
 439                __m256i status0_7 = _mm512_extracti64x4_epi64
 440                        (raw_status0_7, 0);
 441
 442                /* now do flag manipulation */
 443
 444                /* get only flag/error bits we want */
 445                const __m256i flag_bits =
 446                        _mm256_and_si256(status0_7, flags_mask);
 447                /**
 448                 * l3_l4_error flags, shuffle, then shift to correct adjustment
 449                 * of flags in flags_shuf, and finally mask out extra bits
 450                 */
 451                __m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
 452                                _mm256_srli_epi32(flag_bits, 4));
 453                l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
 454                l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);
 455                /* set rss and vlan flags */
 456                const __m256i rss_vlan_flag_bits =
 457                        _mm256_srli_epi32(flag_bits, 12);
 458                const __m256i rss_vlan_flags =
 459                        _mm256_shuffle_epi8(rss_vlan_flags_shuf,
 460                                            rss_vlan_flag_bits);
 461
 462                /* merge flags */
 463                __m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
 464                                                     rss_vlan_flags);
 465
 466                if (rxq->fdir_enabled) {
 467                        const __m256i fdir_id4_7 =
 468                                _mm256_unpackhi_epi32(raw_desc6_7, raw_desc4_5);
 469
 470                        const __m256i fdir_id0_3 =
 471                                _mm256_unpackhi_epi32(raw_desc2_3, raw_desc0_1);
 472
 473                        const __m256i fdir_id0_7 =
 474                                _mm256_unpackhi_epi64(fdir_id4_7, fdir_id0_3);
 475
 476                        const __m256i fdir_flags =
 477                                ice_flex_rxd_to_fdir_flags_vec_avx512
 478                                        (fdir_id0_7);
 479
 480                        /* merge with fdir_flags */
 481                        mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_flags);
 482
 483                        /* write to mbuf: have to use scalar store here */
 484                        rx_pkts[i + 0]->hash.fdir.hi =
 485                                _mm256_extract_epi32(fdir_id0_7, 3);
 486
 487                        rx_pkts[i + 1]->hash.fdir.hi =
 488                                _mm256_extract_epi32(fdir_id0_7, 7);
 489
 490                        rx_pkts[i + 2]->hash.fdir.hi =
 491                                _mm256_extract_epi32(fdir_id0_7, 2);
 492
 493                        rx_pkts[i + 3]->hash.fdir.hi =
 494                                _mm256_extract_epi32(fdir_id0_7, 6);
 495
 496                        rx_pkts[i + 4]->hash.fdir.hi =
 497                                _mm256_extract_epi32(fdir_id0_7, 1);
 498
 499                        rx_pkts[i + 5]->hash.fdir.hi =
 500                                _mm256_extract_epi32(fdir_id0_7, 5);
 501
 502                        rx_pkts[i + 6]->hash.fdir.hi =
 503                                _mm256_extract_epi32(fdir_id0_7, 0);
 504
 505                        rx_pkts[i + 7]->hash.fdir.hi =
 506                                _mm256_extract_epi32(fdir_id0_7, 4);
 507                } /* if() on fdir_enabled */
 508
 509#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
 510                /**
 511                 * needs to load 2nd 16B of each desc for RSS hash parsing,
 512                 * will cause performance drop to get into this context.
 513                 */
 514                if (rxq->vsi->adapter->eth_dev->data->dev_conf.rxmode.offloads &
 515                                DEV_RX_OFFLOAD_RSS_HASH) {
 516                        /* load bottom half of every 32B desc */
 517                        const __m128i raw_desc_bh7 =
 518                                _mm_load_si128
 519                                        ((void *)(&rxdp[7].wb.status_error1));
 520                        rte_compiler_barrier();
 521                        const __m128i raw_desc_bh6 =
 522                                _mm_load_si128
 523                                        ((void *)(&rxdp[6].wb.status_error1));
 524                        rte_compiler_barrier();
 525                        const __m128i raw_desc_bh5 =
 526                                _mm_load_si128
 527                                        ((void *)(&rxdp[5].wb.status_error1));
 528                        rte_compiler_barrier();
 529                        const __m128i raw_desc_bh4 =
 530                                _mm_load_si128
 531                                        ((void *)(&rxdp[4].wb.status_error1));
 532                        rte_compiler_barrier();
 533                        const __m128i raw_desc_bh3 =
 534                                _mm_load_si128
 535                                        ((void *)(&rxdp[3].wb.status_error1));
 536                        rte_compiler_barrier();
 537                        const __m128i raw_desc_bh2 =
 538                                _mm_load_si128
 539                                        ((void *)(&rxdp[2].wb.status_error1));
 540                        rte_compiler_barrier();
 541                        const __m128i raw_desc_bh1 =
 542                                _mm_load_si128
 543                                        ((void *)(&rxdp[1].wb.status_error1));
 544                        rte_compiler_barrier();
 545                        const __m128i raw_desc_bh0 =
 546                                _mm_load_si128
 547                                        ((void *)(&rxdp[0].wb.status_error1));
 548
 549                        __m256i raw_desc_bh6_7 =
 550                                _mm256_inserti128_si256
 551                                        (_mm256_castsi128_si256(raw_desc_bh6),
 552                                        raw_desc_bh7, 1);
 553                        __m256i raw_desc_bh4_5 =
 554                                _mm256_inserti128_si256
 555                                        (_mm256_castsi128_si256(raw_desc_bh4),
 556                                        raw_desc_bh5, 1);
 557                        __m256i raw_desc_bh2_3 =
 558                                _mm256_inserti128_si256
 559                                        (_mm256_castsi128_si256(raw_desc_bh2),
 560                                        raw_desc_bh3, 1);
 561                        __m256i raw_desc_bh0_1 =
 562                                _mm256_inserti128_si256
 563                                        (_mm256_castsi128_si256(raw_desc_bh0),
 564                                        raw_desc_bh1, 1);
 565
 566                        /**
 567                         * to shift the 32b RSS hash value to the
 568                         * highest 32b of each 128b before mask
 569                         */
 570                        __m256i rss_hash6_7 =
 571                                _mm256_slli_epi64(raw_desc_bh6_7, 32);
 572                        __m256i rss_hash4_5 =
 573                                _mm256_slli_epi64(raw_desc_bh4_5, 32);
 574                        __m256i rss_hash2_3 =
 575                                _mm256_slli_epi64(raw_desc_bh2_3, 32);
 576                        __m256i rss_hash0_1 =
 577                                _mm256_slli_epi64(raw_desc_bh0_1, 32);
 578
 579                        __m256i rss_hash_msk =
 580                                _mm256_set_epi32(0xFFFFFFFF, 0, 0, 0,
 581                                                 0xFFFFFFFF, 0, 0, 0);
 582
 583                        rss_hash6_7 = _mm256_and_si256
 584                                        (rss_hash6_7, rss_hash_msk);
 585                        rss_hash4_5 = _mm256_and_si256
 586                                        (rss_hash4_5, rss_hash_msk);
 587                        rss_hash2_3 = _mm256_and_si256
 588                                        (rss_hash2_3, rss_hash_msk);
 589                        rss_hash0_1 = _mm256_and_si256
 590                                        (rss_hash0_1, rss_hash_msk);
 591
 592                        mb6_7 = _mm256_or_si256(mb6_7, rss_hash6_7);
 593                        mb4_5 = _mm256_or_si256(mb4_5, rss_hash4_5);
 594                        mb2_3 = _mm256_or_si256(mb2_3, rss_hash2_3);
 595                        mb0_1 = _mm256_or_si256(mb0_1, rss_hash0_1);
 596                } /* if() on RSS hash parsing */
 597#endif
 598
 599                /**
 600                 * At this point, we have the 8 sets of flags in the low 16-bits
 601                 * of each 32-bit value in vlan0.
 602                 * We want to extract these, and merge them with the mbuf init
 603                 * data so we can do a single write to the mbuf to set the flags
 604                 * and all the other initialization fields. Extracting the
 605                 * appropriate flags means that we have to do a shift and blend
 606                 * for each mbuf before we do the write. However, we can also
 607                 * add in the previously computed rx_descriptor fields to
 608                 * make a single 256-bit write per mbuf
 609                 */
 610                /* check the structure matches expectations */
 611                RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
 612                                 offsetof(struct rte_mbuf, rearm_data) + 8);
 613                RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
 614                                 RTE_ALIGN(offsetof(struct rte_mbuf,
 615                                                    rearm_data),
 616                                           16));
 617                /* build up data and do writes */
 618                __m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
 619                        rearm6, rearm7;
 620
 621                rearm6 = _mm256_blend_epi32(mbuf_init,
 622                                            _mm256_slli_si256(mbuf_flags, 8),
 623                                            0x04);
 624                rearm4 = _mm256_blend_epi32(mbuf_init,
 625                                            _mm256_slli_si256(mbuf_flags, 4),
 626                                            0x04);
 627                rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
 628                rearm0 = _mm256_blend_epi32(mbuf_init,
 629                                            _mm256_srli_si256(mbuf_flags, 4),
 630                                            0x04);
 631
 632                /* permute to add in the rx_descriptor e.g. rss fields */
 633                rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
 634                rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
 635                rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
 636                rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);
 637
 638                /* write to mbuf */
 639                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data,
 640                                    rearm6);
 641                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data,
 642                                    rearm4);
 643                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data,
 644                                    rearm2);
 645                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data,
 646                                    rearm0);
 647
 648                /* repeat for the odd mbufs */
 649                const __m256i odd_flags =
 650                        _mm256_castsi128_si256
 651                                (_mm256_extracti128_si256(mbuf_flags, 1));
 652                rearm7 = _mm256_blend_epi32(mbuf_init,
 653                                            _mm256_slli_si256(odd_flags, 8),
 654                                            0x04);
 655                rearm5 = _mm256_blend_epi32(mbuf_init,
 656                                            _mm256_slli_si256(odd_flags, 4),
 657                                            0x04);
 658                rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
 659                rearm1 = _mm256_blend_epi32(mbuf_init,
 660                                            _mm256_srli_si256(odd_flags, 4),
 661                                            0x04);
 662
 663                /* since odd mbufs are already in hi 128-bits use blend */
 664                rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
 665                rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
 666                rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
 667                rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);
 668                /* again write to mbufs */
 669                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data,
 670                                    rearm7);
 671                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data,
 672                                    rearm5);
 673                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data,
 674                                    rearm3);
 675                _mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data,
 676                                    rearm1);
 677
 678                /* extract and record EOP bit */
 679                if (split_packet) {
 680                        const __m128i eop_mask =
 681                                _mm_set1_epi16(1 << ICE_RX_DESC_STATUS_EOF_S);
 682                        const __m256i eop_bits256 = _mm256_and_si256(status0_7,
 683                                                                     eop_check);
 684                        /* pack status bits into a single 128-bit register */
 685                        const __m128i eop_bits =
 686                                _mm_packus_epi32
 687                                        (_mm256_castsi256_si128(eop_bits256),
 688                                         _mm256_extractf128_si256(eop_bits256,
 689                                                                  1));
 690                        /**
 691                         * flip bits, and mask out the EOP bit, which is now
 692                         * a split-packet bit i.e. !EOP, rather than EOP one.
 693                         */
 694                        __m128i split_bits = _mm_andnot_si128(eop_bits,
 695                                        eop_mask);
 696                        /**
 697                         * eop bits are out of order, so we need to shuffle them
 698                         * back into order again. In doing so, only use low 8
 699                         * bits, which acts like another pack instruction
 700                         * The original order is (hi->lo): 1,3,5,7,0,2,4,6
 701                         * [Since we use epi8, the 16-bit positions are
 702                         * multiplied by 2 in the eop_shuffle value.]
 703                         */
 704                        __m128i eop_shuffle =
 705                                _mm_set_epi8(/* zero hi 64b */
 706                                             0xFF, 0xFF, 0xFF, 0xFF,
 707                                             0xFF, 0xFF, 0xFF, 0xFF,
 708                                             /* move values to lo 64b */
 709                                             8, 0, 10, 2,
 710                                             12, 4, 14, 6);
 711                        split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
 712                        *(uint64_t *)split_packet =
 713                                _mm_cvtsi128_si64(split_bits);
 714                        split_packet += ICE_DESCS_PER_LOOP_AVX;
 715                }
 716
 717                /* perform dd_check */
 718                status0_7 = _mm256_and_si256(status0_7, dd_check);
 719                status0_7 = _mm256_packs_epi32(status0_7,
 720                                               _mm256_setzero_si256());
 721
 722                uint64_t burst = __builtin_popcountll
 723                                        (_mm_cvtsi128_si64
 724                                                (_mm256_extracti128_si256
 725                                                        (status0_7, 1)));
 726                burst += __builtin_popcountll
 727                                (_mm_cvtsi128_si64
 728                                        (_mm256_castsi256_si128(status0_7)));
 729                received += burst;
 730                if (burst != ICE_DESCS_PER_LOOP_AVX)
 731                        break;
 732        }
 733
 734        /* update tail pointers */
 735        rxq->rx_tail += received;
 736        rxq->rx_tail &= (rxq->nb_rx_desc - 1);
 737        if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
 738                rxq->rx_tail--;
 739                received--;
 740        }
 741        rxq->rxrearm_nb += received;
 742        return received;
 743}
 744
 745/**
 746 * Notice:
 747 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 748 */
 749uint16_t
 750ice_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 751                         uint16_t nb_pkts)
 752{
 753        return _ice_recv_raw_pkts_vec_avx512(rx_queue, rx_pkts, nb_pkts, NULL);
 754}
 755
 756/**
 757 * vPMD receive routine that reassembles single burst of 32 scattered packets
 758 * Notice:
 759 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 760 */
 761static uint16_t
 762ice_recv_scattered_burst_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 763                                    uint16_t nb_pkts)
 764{
 765        struct ice_rx_queue *rxq = rx_queue;
 766        uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};
 767
 768        /* get some new buffers */
 769        uint16_t nb_bufs = _ice_recv_raw_pkts_vec_avx512(rxq, rx_pkts, nb_pkts,
 770                                                       split_flags);
 771        if (nb_bufs == 0)
 772                return 0;
 773
 774        /* happy day case, full burst + no packets to be joined */
 775        const uint64_t *split_fl64 = (uint64_t *)split_flags;
 776
 777        if (!rxq->pkt_first_seg &&
 778            split_fl64[0] == 0 && split_fl64[1] == 0 &&
 779            split_fl64[2] == 0 && split_fl64[3] == 0)
 780                return nb_bufs;
 781
 782        /* reassemble any packets that need reassembly */
 783        unsigned int i = 0;
 784
 785        if (!rxq->pkt_first_seg) {
 786                /* find the first split flag, and only reassemble then */
 787                while (i < nb_bufs && !split_flags[i])
 788                        i++;
 789                if (i == nb_bufs)
 790                        return nb_bufs;
 791                rxq->pkt_first_seg = rx_pkts[i];
 792        }
 793        return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
 794                                             &split_flags[i]);
 795}
 796
 797/**
 798 * vPMD receive routine that reassembles scattered packets.
 799 * Main receive routine that can handle arbitrary burst sizes
 800 * Notice:
 801 * - nb_pkts < ICE_DESCS_PER_LOOP, just return no packet
 802 */
 803uint16_t
 804ice_recv_scattered_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
 805                                   uint16_t nb_pkts)
 806{
 807        uint16_t retval = 0;
 808
 809        while (nb_pkts > ICE_VPMD_RX_BURST) {
 810                uint16_t burst = ice_recv_scattered_burst_vec_avx512(rx_queue,
 811                                rx_pkts + retval, ICE_VPMD_RX_BURST);
 812                retval += burst;
 813                nb_pkts -= burst;
 814                if (burst < ICE_VPMD_RX_BURST)
 815                        return retval;
 816        }
 817        return retval + ice_recv_scattered_burst_vec_avx512(rx_queue,
 818                                rx_pkts + retval, nb_pkts);
 819}
 820
/**
 * Free mbufs whose descriptors the HW has completed and return them to
 * their mempool(s).
 *
 * Checks the DD status on the descriptor at tx_next_dd; if it is done,
 * releases tx_rs_thresh buffers from the software ring.
 *
 * @param txq Tx queue to clean.
 * @return tx_rs_thresh on success, 0 if the threshold descriptor is not
 *         yet marked done by the hardware.
 */
static __rte_always_inline int
ice_tx_free_bufs_avx512(struct ice_tx_queue *txq)
{
        struct ice_vec_tx_entry *txep;
        uint32_t n;
        uint32_t i;
        int nb_free = 0;
        struct rte_mbuf *m, *free[ICE_TX_MAX_FREE_BUF_SZ];

        /* check DD bits on threshold descriptor */
        if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
                        rte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) !=
                        rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE))
                return 0;

        n = txq->tx_rs_thresh;

        /* first buffer to free from S/W ring is at index
         * tx_next_dd - (tx_rs_thresh - 1)
         */
        txep = (void *)txq->sw_ring;
        txep += txq->tx_next_dd - (n - 1);

        /* Fast-free path: the MBUF_FAST_FREE offload contract means all
         * mbufs share one mempool and need no per-mbuf prefree work; the
         * (n & 31) == 0 check keeps the AVX512 copy loop below exact.
         */
        if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
                struct rte_mempool *mp = txep[0].mbuf->pool;
                void **cache_objs;
                struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
                                rte_lcore_id());

                /* no per-lcore cache available: fall back to generic path */
                if (!cache || cache->len == 0)
                        goto normal;

                cache_objs = &cache->objs[cache->len];

                /* batch too large to ever fit the cache: bypass it */
                if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
                        rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
                        goto done;
                }

                /* The cache follows the following algorithm
                 *   1. Add the objects to the cache
                 *   2. Anything greater than the cache min value (if it
                 *   crosses the cache flush threshold) is flushed to the ring.
                 */
                /* Add elements back into the cache */
                uint32_t copied = 0;
                /* n is multiple of 32 */
                while (copied < n) {
                        /* copy 32 mbuf pointers (4 x 64B) per iteration;
                         * relies on ice_vec_tx_entry holding one pointer
                         */
                        const __m512i a = _mm512_loadu_si512(&txep[copied]);
                        const __m512i b = _mm512_loadu_si512(&txep[copied + 8]);
                        const __m512i c = _mm512_loadu_si512(&txep[copied + 16]);
                        const __m512i d = _mm512_loadu_si512(&txep[copied + 24]);

                        _mm512_storeu_si512(&cache_objs[copied], a);
                        _mm512_storeu_si512(&cache_objs[copied + 8], b);
                        _mm512_storeu_si512(&cache_objs[copied + 16], c);
                        _mm512_storeu_si512(&cache_objs[copied + 24], d);
                        copied += 32;
                }
                cache->len += n;

                /* spill the cache overflow back to the mempool ring */
                if (cache->len >= cache->flushthresh) {
                        rte_mempool_ops_enqueue_bulk
                                (mp, &cache->objs[cache->size],
                                 cache->len - cache->size);
                        cache->len = cache->size;
                }
                goto done;
        }

normal:
        /* generic path: prefree each mbuf and batch puts per mempool */
        m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
        if (likely(m)) {
                free[0] = m;
                nb_free = 1;
                for (i = 1; i < n; i++) {
                        m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
                        if (likely(m)) {
                                if (likely(m->pool == free[0]->pool)) {
                                        free[nb_free++] = m;
                                } else {
                                        /* pool changed: flush what we have */
                                        rte_mempool_put_bulk(free[0]->pool,
                                                             (void *)free,
                                                             nb_free);
                                        free[0] = m;
                                        nb_free = 1;
                                }
                        }
                }
                rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
        } else {
                /* first mbuf still referenced: free the rest one by one */
                for (i = 1; i < n; i++) {
                        m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
                        if (m)
                                rte_mempool_put(m->pool, m);
                }
        }

done:
        /* buffers were freed, update counters */
        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
        txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
        if (txq->tx_next_dd >= txq->nb_tx_desc)
                txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);

        return txq->tx_rs_thresh;
}
 928
 929static inline void
 930ice_vtx1(volatile struct ice_tx_desc *txdp,
 931         struct rte_mbuf *pkt, uint64_t flags)
 932{
 933        uint64_t high_qw =
 934                (ICE_TX_DESC_DTYPE_DATA |
 935                 ((uint64_t)flags  << ICE_TXD_QW1_CMD_S) |
 936                 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));
 937
 938        __m128i descriptor = _mm_set_epi64x(high_qw,
 939                                pkt->buf_iova + pkt->data_off);
 940        _mm_store_si128((__m128i *)txdp, descriptor);
 941}
 942
 943static inline void
 944ice_vtx(volatile struct ice_tx_desc *txdp,
 945        struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
 946{
 947        const uint64_t hi_qw_tmpl = (ICE_TX_DESC_DTYPE_DATA |
 948                        ((uint64_t)flags  << ICE_TXD_QW1_CMD_S));
 949
 950        for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
 951                uint64_t hi_qw3 =
 952                        hi_qw_tmpl |
 953                        ((uint64_t)pkt[3]->data_len <<
 954                         ICE_TXD_QW1_TX_BUF_SZ_S);
 955                uint64_t hi_qw2 =
 956                        hi_qw_tmpl |
 957                        ((uint64_t)pkt[2]->data_len <<
 958                         ICE_TXD_QW1_TX_BUF_SZ_S);
 959                uint64_t hi_qw1 =
 960                        hi_qw_tmpl |
 961                        ((uint64_t)pkt[1]->data_len <<
 962                         ICE_TXD_QW1_TX_BUF_SZ_S);
 963                uint64_t hi_qw0 =
 964                        hi_qw_tmpl |
 965                        ((uint64_t)pkt[0]->data_len <<
 966                         ICE_TXD_QW1_TX_BUF_SZ_S);
 967
 968                __m512i desc0_3 =
 969                        _mm512_set_epi64
 970                                (hi_qw3,
 971                                 pkt[3]->buf_iova + pkt[3]->data_off,
 972                                 hi_qw2,
 973                                 pkt[2]->buf_iova + pkt[2]->data_off,
 974                                 hi_qw1,
 975                                 pkt[1]->buf_iova + pkt[1]->data_off,
 976                                 hi_qw0,
 977                                 pkt[0]->buf_iova + pkt[0]->data_off);
 978                _mm512_storeu_si512((void *)txdp, desc0_3);
 979        }
 980
 981        /* do any last ones */
 982        while (nb_pkts) {
 983                ice_vtx1(txdp, *pkt, flags);
 984                txdp++, pkt++, nb_pkts--;
 985        }
 986}
 987
 988static __rte_always_inline void
 989ice_tx_backlog_entry_avx512(struct ice_vec_tx_entry *txep,
 990                            struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
 991{
 992        int i;
 993
 994        for (i = 0; i < (int)nb_pkts; ++i)
 995                txep[i].mbuf = tx_pkts[i];
 996}
 997
/**
 * Transmit a burst of at most tx_rs_thresh packets (AVX512 path).
 *
 * Frees completed buffers if needed, copies mbuf pointers into the
 * software ring, writes the hardware descriptors (handling ring
 * wrap-around), sets the RS bit on the tracked descriptor and bumps the
 * queue tail register.
 *
 * @param tx_queue Tx queue (struct ice_tx_queue *).
 * @param tx_pkts  Array of packets to send.
 * @param nb_pkts  Requested burst size.
 * @return Number of packets actually queued for transmission.
 */
static inline uint16_t
ice_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
                                uint16_t nb_pkts)
{
        struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
        volatile struct ice_tx_desc *txdp;
        struct ice_vec_tx_entry *txep;
        uint16_t n, nb_commit, tx_id;
        uint64_t flags = ICE_TD_CMD;
        uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;

        /* crossing the tx_rs_thresh boundary is not allowed */
        nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

        /* reclaim completed descriptors before checking free space */
        if (txq->nb_tx_free < txq->tx_free_thresh)
                ice_tx_free_bufs_avx512(txq);

        nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
        if (unlikely(nb_pkts == 0))
                return 0;

        tx_id = txq->tx_tail;
        txdp = &txq->tx_ring[tx_id];
        txep = (void *)txq->sw_ring;
        txep += tx_id;

        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

        /* n = descriptors left until the end of the ring */
        n = (uint16_t)(txq->nb_tx_desc - tx_id);
        if (nb_commit >= n) {
                /* burst wraps: fill to the end of the ring first */
                ice_tx_backlog_entry_avx512(txep, tx_pkts, n);

                /* last descriptor before the wrap carries the RS bit */
                ice_vtx(txdp, tx_pkts, n - 1, flags);
                tx_pkts += (n - 1);
                txdp += (n - 1);

                ice_vtx1(txdp, *tx_pkts++, rs);

                nb_commit = (uint16_t)(nb_commit - n);

                tx_id = 0;
                txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

                /* avoid reach the end of ring */
                txdp = txq->tx_ring;
                txep = (void *)txq->sw_ring;
        }

        /* remaining packets (or the whole burst if no wrap occurred) */
        ice_tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);

        ice_vtx(txdp, tx_pkts, nb_commit, flags);

        tx_id = (uint16_t)(tx_id + nb_commit);
        if (tx_id > txq->tx_next_rs) {
                /* request a completion report on the tracked descriptor */
                txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
                        rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
                                         ICE_TXD_QW1_CMD_S);
                txq->tx_next_rs =
                        (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
        }

        txq->tx_tail = tx_id;

        /* notify hardware of the new tail position */
        ICE_PCI_REG_WRITE(txq->qtx_tail, txq->tx_tail);

        return nb_pkts;
}
1065
1066uint16_t
1067ice_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
1068                         uint16_t nb_pkts)
1069{
1070        uint16_t nb_tx = 0;
1071        struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
1072
1073        while (nb_pkts) {
1074                uint16_t ret, num;
1075
1076                num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
1077                ret = ice_xmit_fixed_burst_vec_avx512(tx_queue,
1078                                                      &tx_pkts[nb_tx], num);
1079                nb_tx += ret;
1080                nb_pkts -= ret;
1081                if (ret < num)
1082                        break;
1083        }
1084
1085        return nb_tx;
1086}
1087