dpdk/drivers/net/ice/ice_rxtx_vec_sse.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2019 Intel Corporation
 */

#include "ice_rxtx_vec_common.h"

#include <tmmintrin.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

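/**
 * Translate four 32-bit flow director IDs (the descriptor flow_id field)
 * into RTE_MBUF_F_RX_FDIR / RTE_MBUF_F_RX_FDIR_ID flag bits: descriptors
 * whose flow_id equals the 0xFFFFFFFF "mismatch" magic get no flags, all
 * others get both flags set.
 */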
static inline __m128i
ice_flex_rxd_to_fdir_flags_vec(const __m128i fdir_id0_3)
{
#define FDID_MIS_MAGIC 0xFFFFFFFF
        RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR != (1 << 2));
        RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
        const __m128i pkt_fdir_bit = _mm_set1_epi32(RTE_MBUF_F_RX_FDIR |
                        RTE_MBUF_F_RX_FDIR_ID);
        /* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
        const __m128i fdir_mis_mask = _mm_set1_epi32(FDID_MIS_MAGIC);
        __m128i fdir_mask = _mm_cmpeq_epi32(fdir_id0_3,
                        fdir_mis_mask);
        /* this XOR op inverts (bit-reverses) the fdir_mask */
        fdir_mask = _mm_xor_si128(fdir_mask, fdir_mis_mask);
        const __m128i fdir_flags = _mm_and_si128(fdir_mask, pkt_fdir_bit);

        return fdir_flags;
}

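/**
 * Refill the software ring with ICE_RXQ_REARM_THRESH new mbufs from the
 * mempool and write their DMA addresses (plus headroom) into the matching
 * Rx descriptors, two descriptors per iteration, then bump the hardware
 * tail register.
 */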
static inline void
ice_rxq_rearm(struct ice_rx_queue *rxq)
{
        int i;
        uint16_t rx_id;
        volatile union ice_rx_flex_desc *rxdp;
        struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
        struct rte_mbuf *mb0, *mb1;
        __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
                                          RTE_PKTMBUF_HEADROOM);
        __m128i dma_addr0, dma_addr1;

        rxdp = rxq->rx_ring + rxq->rxrearm_start;

        /* Pull 'n' more MBUFs into the software ring */
        if (rte_mempool_get_bulk(rxq->mp,
                                 (void *)rxep,
                                 ICE_RXQ_REARM_THRESH) < 0) {
                if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
                    rxq->nb_rx_desc) {
                        dma_addr0 = _mm_setzero_si128();
                        for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
                                rxep[i].mbuf = &rxq->fake_mbuf;
                                _mm_store_si128((__m128i *)&rxdp[i].read,
                                                dma_addr0);
                        }
                }
                rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
                        ICE_RXQ_REARM_THRESH;
                return;
        }

        /* Initialize the mbufs in vector, process 2 mbufs in one loop */
        for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
                __m128i vaddr0, vaddr1;

                mb0 = rxep[0].mbuf;
                mb1 = rxep[1].mbuf;

                /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
                RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
                                 offsetof(struct rte_mbuf, buf_addr) + 8);
                vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
                vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);

                /* convert pa to dma_addr hdr/data */
                dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
                dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);

                /* add headroom to pa values */
                dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
                dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);

                /* flush desc with pa dma_addr */
                _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
                _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
        }

        rxq->rxrearm_start += ICE_RXQ_REARM_THRESH;
        if (rxq->rxrearm_start >= rxq->nb_rx_desc)
                rxq->rxrearm_start = 0;

        rxq->rxrearm_nb -= ICE_RXQ_REARM_THRESH;

        rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
                           (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

        /* Update the tail pointer on the NIC */
        ICE_PCI_REG_WC_WRITE(rxq->qrx_tail, rx_id);
}

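/**
 * Convert the status/error words of four flex descriptors into mbuf
 * ol_flags (checksum, RSS, VLAN and, optionally, flow director flags)
 * and write them, merged with the precomputed rearm data, into the four
 * mbufs with a single 16-byte store each.
 */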
static inline void
ice_rx_desc_to_olflags_v(struct ice_rx_queue *rxq, __m128i descs[4],
                         struct rte_mbuf **rx_pkts)
{
        const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
        __m128i rearm0, rearm1, rearm2, rearm3;

        __m128i tmp_desc, flags, rss_vlan;

        /* mask everything except checksum, RSS and VLAN flags.
         * bit6:4 for checksum.
         * bit12 for RSS indication.
         * bit13 for VLAN indication.
         */
        const __m128i desc_mask = _mm_set_epi32(0x30f0, 0x30f0,
                                                0x30f0, 0x30f0);
        const __m128i cksum_mask = _mm_set_epi32(RTE_MBUF_F_RX_IP_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                                                 RTE_MBUF_F_RX_IP_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                                                 RTE_MBUF_F_RX_IP_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                                                 RTE_MBUF_F_RX_IP_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_L4_CKSUM_MASK |
                                                 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD);

        /* map the checksum, RSS and VLAN fields to the checksum, RSS
         * and VLAN flags
         */
        const __m128i cksum_flags =
                _mm_set_epi8((RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 |
                 RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                  RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
                 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_BAD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
                 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                /**
                 * shift right 20 bits to use the low two bits to indicate
                 * outer checksum status
                 * shift right 1 bit to make sure it does not exceed 255
                 */
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                 RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
                 RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                (RTE_MBUF_F_RX_OUTER_L4_CKSUM_GOOD >> 20 | RTE_MBUF_F_RX_L4_CKSUM_GOOD |
                 RTE_MBUF_F_RX_IP_CKSUM_GOOD) >> 1);

        const __m128i rss_vlan_flags = _mm_set_epi8(0, 0, 0, 0,
                        0, 0, 0, 0,
                        0, 0, 0, 0,
                        RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
                        RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED,
                        RTE_MBUF_F_RX_RSS_HASH, 0);

        /* merge 4 descriptors */
        flags = _mm_unpackhi_epi32(descs[0], descs[1]);
        tmp_desc = _mm_unpackhi_epi32(descs[2], descs[3]);
        tmp_desc = _mm_unpacklo_epi64(flags, tmp_desc);
        tmp_desc = _mm_and_si128(tmp_desc, desc_mask);

        /* checksum flags */
        tmp_desc = _mm_srli_epi32(tmp_desc, 4);
        flags = _mm_shuffle_epi8(cksum_flags, tmp_desc);
        /* then we shift left 1 bit */
        flags = _mm_slli_epi32(flags, 1);

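        /* After the shift, bits 1:2 of each lane hold the outer L4 checksum
         * status (the original ol_flags bits shifted right by 20); move them
         * back up by 20 bits and merge with the inner L3/L4 flags.
         */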
        __m128i l4_outer_mask = _mm_set_epi32(0x6, 0x6, 0x6, 0x6);
        __m128i l4_outer_flags = _mm_and_si128(flags, l4_outer_mask);
        l4_outer_flags = _mm_slli_epi32(l4_outer_flags, 20);

        __m128i l3_l4_mask = _mm_set_epi32(~0x6, ~0x6, ~0x6, ~0x6);
        __m128i l3_l4_flags = _mm_and_si128(flags, l3_l4_mask);
        flags = _mm_or_si128(l3_l4_flags, l4_outer_flags);
        /* we need to mask out the redundant bits introduced by RSS or
         * VLAN fields.
         */
        flags = _mm_and_si128(flags, cksum_mask);

        /* RSS, VLAN flag */
        tmp_desc = _mm_srli_epi32(tmp_desc, 8);
        rss_vlan = _mm_shuffle_epi8(rss_vlan_flags, tmp_desc);

        /* merge the flags */
        flags = _mm_or_si128(flags, rss_vlan);

        if (rxq->fdir_enabled) {
                const __m128i fdir_id0_1 =
                        _mm_unpackhi_epi32(descs[0], descs[1]);

                const __m128i fdir_id2_3 =
                        _mm_unpackhi_epi32(descs[2], descs[3]);

                const __m128i fdir_id0_3 =
                        _mm_unpackhi_epi64(fdir_id0_1, fdir_id2_3);

                const __m128i fdir_flags =
                        ice_flex_rxd_to_fdir_flags_vec(fdir_id0_3);

                /* merge with fdir_flags */
                flags = _mm_or_si128(flags, fdir_flags);

                /* write fdir_id to mbuf */
                rx_pkts[0]->hash.fdir.hi =
                        _mm_extract_epi32(fdir_id0_3, 0);

                rx_pkts[1]->hash.fdir.hi =
                        _mm_extract_epi32(fdir_id0_3, 1);

                rx_pkts[2]->hash.fdir.hi =
                        _mm_extract_epi32(fdir_id0_3, 2);

                rx_pkts[3]->hash.fdir.hi =
                        _mm_extract_epi32(fdir_id0_3, 3);
        } /* if() on fdir_enabled */

        /**
         * At this point, we have the 4 sets of flags in the low 16-bits
         * of each 32-bit value in flags.
         * We want to extract these, and merge them with the mbuf init data
         * so we can do a single 16-byte write to the mbuf to set the flags
         * and all the other initialization fields. Extracting the
         * appropriate flags means that we have to do a shift and blend for
         * each mbuf before we do the write.
         */
        rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 8), 0x30);
        rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(flags, 4), 0x30);
        rearm2 = _mm_blend_epi16(mbuf_init, flags, 0x30);
        rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(flags, 4), 0x30);

        /* write the rearm data and the olflags in one write */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
                         offsetof(struct rte_mbuf, rearm_data) + 8);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
                         RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
        _mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
        _mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
        _mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
        _mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}

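/**
 * Extract the hardware packet type field (masked by ICE_RX_FLEX_DESC_PTYPE_M)
 * from four descriptors and translate it to an mbuf packet_type via the
 * adapter's ptype table.
 */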
static inline void
ice_rx_desc_to_ptype_v(__m128i descs[4], struct rte_mbuf **rx_pkts,
                       uint32_t *ptype_tbl)
{
        const __m128i ptype_mask = _mm_set_epi16(ICE_RX_FLEX_DESC_PTYPE_M, 0,
                                                 ICE_RX_FLEX_DESC_PTYPE_M, 0,
                                                 ICE_RX_FLEX_DESC_PTYPE_M, 0,
                                                 ICE_RX_FLEX_DESC_PTYPE_M, 0);
        __m128i ptype_01 = _mm_unpacklo_epi32(descs[0], descs[1]);
        __m128i ptype_23 = _mm_unpacklo_epi32(descs[2], descs[3]);
        __m128i ptype_all = _mm_unpacklo_epi64(ptype_01, ptype_23);

        ptype_all = _mm_and_si128(ptype_all, ptype_mask);

        rx_pkts[0]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 1)];
        rx_pkts[1]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 3)];
        rx_pkts[2]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 5)];
        rx_pkts[3]->packet_type = ptype_tbl[_mm_extract_epi16(ptype_all, 7)];
}

/**
 * vPMD raw receive routine, only accepts nb_pkts >= ICE_DESCS_PER_LOOP
 *
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 * - nb_pkts is floor-aligned to a multiple of ICE_DESCS_PER_LOOP
 */
static inline uint16_t
_ice_recv_raw_pkts_vec(struct ice_rx_queue *rxq, struct rte_mbuf **rx_pkts,
                       uint16_t nb_pkts, uint8_t *split_packet)
{
        volatile union ice_rx_flex_desc *rxdp;
        struct ice_rx_entry *sw_ring;
        uint16_t nb_pkts_recd;
        int pos;
        uint64_t var;
        uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
        __m128i crc_adjust = _mm_set_epi16
                                (0, 0, 0,       /* ignore non-length fields */
                                 -rxq->crc_len, /* sub crc on data_len */
                                 0,          /* ignore high-16bits of pkt_len */
                                 -rxq->crc_len, /* sub crc on pkt_len */
                                 0, 0           /* ignore pkt_type field */
                                );
        const __m128i zero = _mm_setzero_si128();
        /* mask to shuffle from desc. to mbuf */
        const __m128i shuf_msk = _mm_set_epi8
                        (0xFF, 0xFF,
                         0xFF, 0xFF,  /* rss hash parsed separately */
                         11, 10,      /* octet 10~11, 16 bits vlan_macip */
                         5, 4,        /* octet 4~5, 16 bits data_len */
                         0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
                         5, 4,        /* octet 4~5, low 16 bits pkt_len */
                         0xFF, 0xFF,  /* pkt_type set as unknown */
                         0xFF, 0xFF   /* pkt_type set as unknown */
                        );
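        /* The EOP shuffle mask gathers one status byte per descriptor from
         * the (out-of-order) staterr lanes into the low four bytes, restoring
         * packet order, so the split flags can be stored with a single
         * 32-bit write below.
         */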
        const __m128i eop_shuf_mask = _mm_set_epi8(0xFF, 0xFF,
                                                   0xFF, 0xFF,
                                                   0xFF, 0xFF,
                                                   0xFF, 0xFF,
                                                   0xFF, 0xFF,
                                                   0xFF, 0xFF,
                                                   0x04, 0x0C,
                                                   0x00, 0x08);

        /**
         * compile-time check the above crc_adjust layout is correct.
         * NOTE: the first field (lowest address) is given last in set_epi16
         * call above.
         */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);

        /* 4 packets DD mask */
        const __m128i dd_check = _mm_set_epi64x(0x0000000100000001LL,
                                                0x0000000100000001LL);
        /* 4 packets EOP mask */
        const __m128i eop_check = _mm_set_epi64x(0x0000000200000002LL,
                                                 0x0000000200000002LL);

        /* nb_pkts has to be floor-aligned to ICE_DESCS_PER_LOOP */
        nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, ICE_DESCS_PER_LOOP);

        /* Just the act of getting into the function from the application is
         * going to cost about 7 cycles
         */
        rxdp = rxq->rx_ring + rxq->rx_tail;

        rte_prefetch0(rxdp);

        /* See if we need to rearm the RX queue - gives the prefetch a bit
         * of time to act
         */
        if (rxq->rxrearm_nb > ICE_RXQ_REARM_THRESH)
                ice_rxq_rearm(rxq);

        /* Before we start moving massive data around, check to see if
         * there is actually a packet available
         */
        if (!(rxdp->wb.status_error0 &
              rte_cpu_to_le_32(1 << ICE_RX_FLEX_DESC_STATUS0_DD_S)))
                return 0;

        /**
         * Compile-time verify the shuffle mask
         * NOTE: some field positions already verified above, but duplicated
         * here for completeness in case of future modifications.
         */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

        /* Cache is empty -> need to scan the buffer rings, but first move
         * the next 'n' mbufs into the cache
         */
        sw_ring = &rxq->sw_ring[rxq->rx_tail];

        /* A. load 4 packets in one loop
         * [A*. mask out 4 unused dirty fields in desc]
         * B. copy 4 mbuf pointers from swring to rx_pkts
         * C. calc the number of DD bits among the 4 packets
         * [C*. extract the end-of-packet bit, if requested]
         * D. fill info. from desc to mbuf
         */

        for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
             pos += ICE_DESCS_PER_LOOP,
             rxdp += ICE_DESCS_PER_LOOP) {
                __m128i descs[ICE_DESCS_PER_LOOP];
                __m128i pkt_mb0, pkt_mb1, pkt_mb2, pkt_mb3;
                __m128i staterr, sterr_tmp1, sterr_tmp2;
                /* 2 64 bit or 4 32 bit mbuf pointers in one XMM reg. */
                __m128i mbp1;
#if defined(RTE_ARCH_X86_64)
                __m128i mbp2;
#endif

                /* B.1 load 2 (64 bit) or 4 (32 bit) mbuf pointers */
                mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);
                /* Read desc statuses backwards to avoid race condition */
                /* A.1 load desc[3] */
                descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));
                rte_compiler_barrier();

                /* B.2 copy 2 64 bit or 4 32 bit mbuf pointers into rx_pkts */
                _mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

#if defined(RTE_ARCH_X86_64)
                /* B.1 load 2 64 bit mbuf pointers */
                mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos + 2]);
#endif

                /* A.1 load desc[2-0] */
                descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
                rte_compiler_barrier();
                descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
                rte_compiler_barrier();
                descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

#if defined(RTE_ARCH_X86_64)
                /* B.2 copy 2 mbuf pointers into rx_pkts */
                _mm_storeu_si128((__m128i *)&rx_pkts[pos + 2], mbp2);
#endif

                if (split_packet) {
                        rte_mbuf_prefetch_part2(rx_pkts[pos]);
                        rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
                        rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
                        rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
                }

                /* avoid compiler reorder optimization */
                rte_compiler_barrier();

                /* D.1 pkt 3,4 convert format from desc to pktmbuf */
                pkt_mb3 = _mm_shuffle_epi8(descs[3], shuf_msk);
                pkt_mb2 = _mm_shuffle_epi8(descs[2], shuf_msk);

                /* D.1 pkt 1,2 convert format from desc to pktmbuf */
                pkt_mb1 = _mm_shuffle_epi8(descs[1], shuf_msk);
                pkt_mb0 = _mm_shuffle_epi8(descs[0], shuf_msk);

                /* C.1 4=>2 filter staterr info only */
                sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
                /* C.1 4=>2 filter staterr info only */
                sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

                ice_rx_desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

                /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
                pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);
                pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);

                /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
                pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);
                pkt_mb0 = _mm_add_epi16(pkt_mb0, crc_adjust);

#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
                /**
                 * need to load the 2nd 16B of each desc for RSS hash parsing;
                 * getting into this branch causes a performance drop.
                 */
                if (rxq->vsi->adapter->pf.dev_data->dev_conf.rxmode.offloads &
                                RTE_ETH_RX_OFFLOAD_RSS_HASH) {
                        /* load bottom half of every 32B desc */
                        const __m128i raw_desc_bh3 =
                                _mm_load_si128
                                        ((void *)(&rxdp[3].wb.status_error1));
                        rte_compiler_barrier();
                        const __m128i raw_desc_bh2 =
                                _mm_load_si128
                                        ((void *)(&rxdp[2].wb.status_error1));
                        rte_compiler_barrier();
                        const __m128i raw_desc_bh1 =
                                _mm_load_si128
                                        ((void *)(&rxdp[1].wb.status_error1));
                        rte_compiler_barrier();
                        const __m128i raw_desc_bh0 =
                                _mm_load_si128
                                        ((void *)(&rxdp[0].wb.status_error1));

                        /**
                         * to shift the 32b RSS hash value to the
                         * highest 32b of each 128b before mask
                         */
                        __m128i rss_hash3 =
                                _mm_slli_epi64(raw_desc_bh3, 32);
                        __m128i rss_hash2 =
                                _mm_slli_epi64(raw_desc_bh2, 32);
                        __m128i rss_hash1 =
                                _mm_slli_epi64(raw_desc_bh1, 32);
                        __m128i rss_hash0 =
                                _mm_slli_epi64(raw_desc_bh0, 32);

                        __m128i rss_hash_msk =
                                _mm_set_epi32(0xFFFFFFFF, 0, 0, 0);

                        rss_hash3 = _mm_and_si128
                                        (rss_hash3, rss_hash_msk);
                        rss_hash2 = _mm_and_si128
                                        (rss_hash2, rss_hash_msk);
                        rss_hash1 = _mm_and_si128
                                        (rss_hash1, rss_hash_msk);
                        rss_hash0 = _mm_and_si128
                                        (rss_hash0, rss_hash_msk);

                        pkt_mb3 = _mm_or_si128(pkt_mb3, rss_hash3);
                        pkt_mb2 = _mm_or_si128(pkt_mb2, rss_hash2);
                        pkt_mb1 = _mm_or_si128(pkt_mb1, rss_hash1);
                        pkt_mb0 = _mm_or_si128(pkt_mb0, rss_hash0);
                } /* if() on RSS hash parsing */
#endif

                /* C.2 get 4 pkts staterr value */
                staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

                /* D.3 copy final 3,4 data to rx_pkts */
                _mm_storeu_si128
                        ((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
                         pkt_mb3);
                _mm_storeu_si128
                        ((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
                         pkt_mb2);

                /* C* extract and record EOP bit */
                if (split_packet) {
                        /* and with mask to extract bits, flipping 1-0 */
                        __m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
                        /* the staterr values are not in order, as the count
                         * of dd bits doesn't care. However, for end of
                         * packet tracking, we do care, so shuffle. This also
                         * compresses the 32-bit values to 8-bit
                         */
                        eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
                        /* store the resulting 32-bit value */
                        *(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
                        split_packet += ICE_DESCS_PER_LOOP;
                }

                /* C.3 calc available number of desc */
                staterr = _mm_and_si128(staterr, dd_check);
                staterr = _mm_packs_epi32(staterr, zero);

                /* D.3 copy final 1,2 data to rx_pkts */
                _mm_storeu_si128
                        ((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
                         pkt_mb1);
                _mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                                 pkt_mb0);
                ice_rx_desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
                /* C.4 calc available number of desc */
                var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
                nb_pkts_recd += var;
                if (likely(var != ICE_DESCS_PER_LOOP))
                        break;
        }

        /* Update our internal tail pointer */
        rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
        rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
        rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

        return nb_pkts_recd;
}

/**
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 * - if nb_pkts > ICE_VPMD_RX_BURST, only ICE_VPMD_RX_BURST
 *   descriptors' DD bits are scanned
 */
uint16_t
ice_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                  uint16_t nb_pkts)
{
        return _ice_recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 *
 * Notice:
 * - if nb_pkts < ICE_DESCS_PER_LOOP, no packets are returned
 */
static uint16_t
ice_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                             uint16_t nb_pkts)
{
        struct ice_rx_queue *rxq = rx_queue;
        uint8_t split_flags[ICE_VPMD_RX_BURST] = {0};

        /* get some new buffers */
        uint16_t nb_bufs = _ice_recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
                                                  split_flags);
        if (nb_bufs == 0)
                return 0;

        /* happy day case, full burst + no packets to be joined */
        const uint64_t *split_fl64 = (uint64_t *)split_flags;

        if (!rxq->pkt_first_seg &&
            split_fl64[0] == 0 && split_fl64[1] == 0 &&
            split_fl64[2] == 0 && split_fl64[3] == 0)
                return nb_bufs;

        /* reassemble any packets that need reassembly */
        unsigned int i = 0;

        if (!rxq->pkt_first_seg) {
                /* find the first split flag, and only reassemble from there */
                while (i < nb_bufs && !split_flags[i])
                        i++;
                if (i == nb_bufs)
                        return nb_bufs;
                rxq->pkt_first_seg = rx_pkts[i];
        }
        return i + ice_rx_reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
                                             &split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
ice_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                            uint16_t nb_pkts)
{
        uint16_t retval = 0;

        while (nb_pkts > ICE_VPMD_RX_BURST) {
                uint16_t burst;

                burst = ice_recv_scattered_burst_vec(rx_queue,
                                                     rx_pkts + retval,
                                                     ICE_VPMD_RX_BURST);
                retval += burst;
                nb_pkts -= burst;
                if (burst < ICE_VPMD_RX_BURST)
                        return retval;
        }

        return retval + ice_recv_scattered_burst_vec(rx_queue,
                                                     rx_pkts + retval,
                                                     nb_pkts);
}

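/**
 * Build a single Tx data descriptor for one mbuf: descriptor type, command
 * flags and buffer length go in the high qword, the buffer DMA address in
 * the low qword.
 */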
static inline void
ice_vtx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf *pkt,
         uint64_t flags)
{
        uint64_t high_qw =
                (ICE_TX_DESC_DTYPE_DATA |
                 ((uint64_t)flags << ICE_TXD_QW1_CMD_S) |
                 ((uint64_t)pkt->data_len << ICE_TXD_QW1_TX_BUF_SZ_S));

        __m128i descriptor = _mm_set_epi64x(high_qw,
                                            pkt->buf_iova + pkt->data_off);
        _mm_store_si128((__m128i *)txdp, descriptor);
}

static inline void
ice_vtx(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkt,
        uint16_t nb_pkts, uint64_t flags)
{
        int i;

        for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
                ice_vtx1(txdp, *pkt, flags);
}

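/**
 * Transmit at most tx_rs_thresh packets using the simple descriptor format;
 * the caller splits larger bursts so a single call never crosses an
 * RS-threshold boundary.
 */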
static uint16_t
ice_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
                         uint16_t nb_pkts)
{
        struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;
        volatile struct ice_tx_desc *txdp;
        struct ice_tx_entry *txep;
        uint16_t n, nb_commit, tx_id;
        uint64_t flags = ICE_TD_CMD;
        uint64_t rs = ICE_TX_DESC_CMD_RS | ICE_TD_CMD;
        int i;

        /* crossing the tx_rs_thresh boundary is not allowed */
        nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

        if (txq->nb_tx_free < txq->tx_free_thresh)
                ice_tx_free_bufs_vec(txq);

        nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
        nb_commit = nb_pkts;
        if (unlikely(nb_pkts == 0))
                return 0;

        tx_id = txq->tx_tail;
        txdp = &txq->tx_ring[tx_id];
        txep = &txq->sw_ring[tx_id];

        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

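        /* If the burst would run past the end of the ring, fill the
         * descriptors up to the ring end first (marking the last one with
         * RS), then wrap to index 0 for the remainder.
         */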
        n = (uint16_t)(txq->nb_tx_desc - tx_id);
        if (nb_commit >= n) {
                ice_tx_backlog_entry(txep, tx_pkts, n);

                for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
                        ice_vtx1(txdp, *tx_pkts, flags);

                ice_vtx1(txdp, *tx_pkts++, rs);

                nb_commit = (uint16_t)(nb_commit - n);

                tx_id = 0;
                txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

                /* avoid reaching the end of the ring */
                txdp = &txq->tx_ring[tx_id];
                txep = &txq->sw_ring[tx_id];
        }

        ice_tx_backlog_entry(txep, tx_pkts, nb_commit);

        ice_vtx(txdp, tx_pkts, nb_commit, flags);

        tx_id = (uint16_t)(tx_id + nb_commit);
        if (tx_id > txq->tx_next_rs) {
                txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
                        rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
                                         ICE_TXD_QW1_CMD_S);
                txq->tx_next_rs =
                        (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
        }

        txq->tx_tail = tx_id;

        ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

        return nb_pkts;
}

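/**
 * Vector Tx entry point: chop the burst into chunks of at most tx_rs_thresh
 * packets, hand each chunk to ice_xmit_fixed_burst_vec and stop early if the
 * ring runs out of free descriptors.
 */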
uint16_t
ice_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
                  uint16_t nb_pkts)
{
        uint16_t nb_tx = 0;
        struct ice_tx_queue *txq = (struct ice_tx_queue *)tx_queue;

        while (nb_pkts) {
                uint16_t ret, num;

                num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
                ret = ice_xmit_fixed_burst_vec(tx_queue, &tx_pkts[nb_tx], num);
                nb_tx += ret;
                nb_pkts -= ret;
                if (ret < num)
                        break;
        }

        return nb_tx;
}

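/**
 * Per-queue setup hooks for the vector paths: register the vector
 * mbuf-release callbacks; the Rx queue additionally runs the common
 * default vector setup.
 */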
int __rte_cold
ice_rxq_vec_setup(struct ice_rx_queue *rxq)
{
        if (!rxq)
                return -1;

        rxq->rx_rel_mbufs = _ice_rx_queue_release_mbufs_vec;
        return ice_rxq_vec_setup_default(rxq);
}

int __rte_cold
ice_txq_vec_setup(struct ice_tx_queue __rte_unused *txq)
{
        if (!txq)
                return -1;

        txq->tx_rel_mbufs = _ice_tx_queue_release_mbufs_vec;
        return 0;
}

int __rte_cold
ice_rx_vec_dev_check(struct rte_eth_dev *dev)
{
        return ice_rx_vec_dev_check_default(dev);
}

int __rte_cold
ice_tx_vec_dev_check(struct rte_eth_dev *dev)
{
        return ice_tx_vec_dev_check_default(dev);
}