dpdk/drivers/net/enic/enic_rxtx_vec_avx2.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2008-2018 Cisco Systems, Inc.  All rights reserved.
 * Copyright 2007 Nuova Systems, Inc.  All rights reserved.
 */

#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>
#include <rte_vect.h>

#include "enic_compat.h"
#include "rq_enet_desc.h"
#include "enic.h"
#include "enic_rxtx_common.h"

#include <x86intrin.h>

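/*
 * Scalar completion handler, used by the pre-alignment step and the
 * cleanup step at the end of the burst function below. Note that when
 * overlay offload is enabled, the firmware repurposes the FCoE bit of
 * the completion as the tunneled-packet indicator, which is why
 * CQ_ENET_RQ_DESC_FLAGS_FCOE feeds the 'tnl' flag here.
 */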
static struct rte_mbuf *
rx_one(struct cq_enet_rq_desc *cqd, struct rte_mbuf *mb, struct enic *enic)
{
        bool tnl;

        *(uint64_t *)&mb->rearm_data = enic->mbuf_initializer;
        mb->data_len = cqd->bytes_written_flags &
                CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK;
        mb->pkt_len = mb->data_len;
        tnl = enic->overlay_offload && (cqd->completed_index_flags &
                                        CQ_ENET_RQ_DESC_FLAGS_FCOE) != 0;
        mb->packet_type =
                enic_cq_rx_flags_to_pkt_type((struct cq_desc *)cqd, tnl);
        enic_cq_rx_to_pkt_flags((struct cq_desc *)cqd, mb);
        /* Wipe the outer types set by enic_cq_rx_flags_to_pkt_type() */
        if (tnl) {
                mb->packet_type &= ~(RTE_PTYPE_L3_MASK |
                                     RTE_PTYPE_L4_MASK);
        }
        return mb;
}

static uint16_t
enic_noscatter_vec_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
                             uint16_t nb_pkts)
{
        struct rte_mbuf **rx, **rxmb;
        uint16_t cq_idx, nb_rx, max_rx;
        struct cq_enet_rq_desc *cqd;
        struct rq_enet_desc *rqd;
        struct vnic_cq *cq;
        struct vnic_rq *rq;
        struct enic *enic;
        uint8_t color;

        rq = rx_queue;
        enic = vnic_dev_priv(rq->vdev);
        cq = &enic->cq[enic_cq_rq(enic, rq->index)];
        cq_idx = cq->to_clean;

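        /*
         * Completion ownership is tracked with a color bit: the NIC writes
         * each completion with a color that flips on every pass over the
         * ring, and software keeps the stale color in cq->last_color,
         * toggling it when to_clean wraps (see the end of this function).
         * A descriptor whose color still equals cq->last_color has not
         * been completed yet and must not be consumed.
         */
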
        /*
         * Fill up the reserve of free mbufs. Below, we restock the receive
         * ring with these mbufs to avoid allocation failures.
         */
        if (rq->num_free_mbufs == 0) {
                if (rte_mempool_get_bulk(rq->mp, (void **)rq->free_mbufs,
                                         ENIC_RX_BURST_MAX))
                        return 0;
                rq->num_free_mbufs = ENIC_RX_BURST_MAX;
        }
        /* Receive until the end of the ring, at most. */
        max_rx = RTE_MIN(nb_pkts, rq->num_free_mbufs);
        max_rx = RTE_MIN(max_rx, cq->ring.desc_count - cq_idx);

        rxmb = rq->mbuf_ring + cq_idx;
        color = cq->last_color;
        cqd = (struct cq_enet_rq_desc *)(cq->ring.descs) + cq_idx;
        rx = rx_pkts;
        if (max_rx == 0 ||
            (cqd->type_color & CQ_DESC_COLOR_MASK_NOSHIFT) == color)
                return 0;

        /*
         * Step 1: If cq_idx is odd, process one descriptor the slow way so
         * that the 256-bit loads below are 32-byte aligned (descriptors
         * are 16 bytes each).
         */
        if (cq_idx & 0x1) {
                if (unlikely(cqd->bytes_written_flags &
                             CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
                        rte_pktmbuf_free(*rxmb++);
                        rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
                } else {
                        *rx++ = rx_one(cqd, *rxmb++, enic);
                }
                cqd++;
                max_rx--;
        }
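
        /*
         * At this point cq_idx is even, so cqd is 32-byte aligned provided
         * the ring base itself is at least 32-byte aligned; the aligned
         * 256-bit loads in Step 2 depend on this.
         */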

        const __m256i mask =
                _mm256_set_epi8(/* Second descriptor */
                        0xff, /* type_color */
                        (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
                         CQ_ENET_RQ_DESC_FLAGS_IPV4 |
                         CQ_ENET_RQ_DESC_FLAGS_IPV6 |
                         CQ_ENET_RQ_DESC_FLAGS_TCP |
                         CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
                        0, 0, /* checksum_fcoe */
                        0xff, 0xff, /* vlan */
                        0x3f, 0xff, /* bytes_written_flags */
                        0xff, 0xff, 0xff, 0xff, /* rss_hash */
                        0xff, 0xff, /* q_number_rss_type_flags */
                        0, 0, /* completed_index_flags */
                        /* First descriptor */
                        0xff, /* type_color */
                        (CQ_ENET_RQ_DESC_FLAGS_IPV4_FRAGMENT |
                         CQ_ENET_RQ_DESC_FLAGS_IPV4 |
                         CQ_ENET_RQ_DESC_FLAGS_IPV6 |
                         CQ_ENET_RQ_DESC_FLAGS_TCP |
                         CQ_ENET_RQ_DESC_FLAGS_UDP), /* flags */
                        0, 0, /* checksum_fcoe */
                        0xff, 0xff, /* vlan */
                        0x3f, 0xff, /* bytes_written_flags */
                        0xff, 0xff, 0xff, 0xff, /* rss_hash */
                        0xff, 0xff, /* q_number_rss_type_flags */
                        0, 0 /* completed_index_flags */
                        );
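        /*
         * Note on the _mm256_set_epi8() initializers used here: arguments
         * are given highest byte first, so within each 16-byte descriptor
         * image the fields appear in reverse order, from type_color down
         * to completed_index_flags. In the mask above, the 0x3f in the
         * bytes_written_flags position keeps only the 14 length bits
         * (CQ_ENET_RQ_DESC_BYTES_WRITTEN_MASK), clearing the truncated and
         * vlan_stripped flag bits so that pkt_len/data_len come out clean.
         */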
        const __m256i shuffle_mask =
                _mm256_set_epi8(/* Second descriptor */
                        7, 6, 5, 4,             /* rss = rss_hash */
                        11, 10,                 /* vlan_tci = vlan */
                        9, 8,                   /* data_len = bytes_written */
                        0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
                        0x80, 0x80, 0x80, 0x80, /* packet_type = 0 */
                        /* First descriptor */
                        7, 6, 5, 4,             /* rss = rss_hash */
                        11, 10,                 /* vlan_tci = vlan */
                        9, 8,                   /* data_len = bytes_written */
                        0x80, 0x80, 9, 8,       /* pkt_len = bytes_written */
                        0x80, 0x80, 0x80, 0x80  /* packet_type = 0 */
                        );
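        /*
         * 0x80 in a shuffle control byte has the high bit set, which makes
         * _mm256_shuffle_epi8() write zero to that output byte instead of
         * copying a source byte. The shuffle also operates on each 128-bit
         * lane independently, so the byte indices refer to bytes within
         * the corresponding descriptor.
         */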
        /* Used to collect 8 flags from 8 desc into one register */
        const __m256i flags_shuffle_mask =
                _mm256_set_epi8(/* Second descriptor */
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        /* First descriptor */
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        1, 3, 9, 14,
                        /*
                         * Byte 3: upper byte of completed_index_flags
                         *         bit 5 = fcoe (tunnel)
                         * Byte 2: upper byte of q_number_rss_type_flags
                         *         bits 2,3,4,5 = rss type
                         *         bit 6 = csum_not_calc
                         * Byte 1: upper byte of bytes_written_flags
                         *         bit 6 = truncated
                         *         bit 7 = vlan stripped
                         * Byte 0: flags
                         */
                        1, 3, 9, 14
                        );
        /* Used to collect 8 VLAN IDs from 8 desc into one register */
        const __m256i vlan_shuffle_mask =
                _mm256_set_epi8(/* Second descriptor */
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        /* First descriptor */
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10,
                        0x80, 0x80, 11, 10);
        /* PKT_RX_RSS_HASH is 1<<1, so it fits in an 8-bit integer */
        const __m256i rss_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        0, /* rss_types = 0 */
                        /* first 128 bits */
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH,
                        0 /* rss_types = 0 */);
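        /*
         * rss_shuffle is a 16-entry byte lookup table indexed by the 4-bit
         * RSS type from the completion: type 0 means the NIC computed no
         * hash, and every other type yields PKT_RX_RSS_HASH. Since the
         * lookup is a byte shuffle, the flag must fit in 8 bits.
         */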
        /*
         * VLAN offload flags.
         * shuffle index:
         * vlan_stripped => bit 0
         * vlan_id == 0  => bit 1
         */
        const __m256i vlan_shuffle =
                _mm256_set_epi32(0, 0, 0, 0,
                        PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
                        PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, PKT_RX_VLAN);
        /* Use the same shuffle index as vlan_shuffle */
        const __m256i vlan_ptype_shuffle =
                _mm256_set_epi32(0, 0, 0, 0,
                                 RTE_PTYPE_L2_ETHER,
                                 RTE_PTYPE_L2_ETHER,
                                 RTE_PTYPE_L2_ETHER,
                                 RTE_PTYPE_L2_ETHER_VLAN);
        /*
         * CKSUM flags. Shift right so they fit in 8-bit integers.
         * shuffle index:
         * ipv4_csum_ok    => bit 3
         * ip4             => bit 2
         * tcp_or_udp      => bit 1
         * tcp_udp_csum_ok => bit 0
         */
        const __m256i csum_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        /* 1111 ip4+ip4_ok+l4+l4_ok */
                        ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
                        /* 1110 ip4_ok+ip4+l4+!l4_ok */
                        ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
                        (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1101 ip4+ip4_ok */
                        (PKT_RX_IP_CKSUM_GOOD >> 1), /* 1100 ip4_ok+ip4 */
                        (PKT_RX_L4_CKSUM_GOOD >> 1), /* 1011 l4+l4_ok */
                        (PKT_RX_L4_CKSUM_BAD >> 1),  /* 1010 l4+!l4_ok */
                        0, /* 1001 */
                        0, /* 1000 */
                        /* 0111 !ip4_ok+ip4+l4+l4_ok */
                        ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
                        /* 0110 !ip4_ok+ip4+l4+!l4_ok */
                        ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
                        (PKT_RX_IP_CKSUM_BAD >> 1),  /* 0101 !ip4_ok+ip4 */
                        (PKT_RX_IP_CKSUM_BAD >> 1),  /* 0100 !ip4_ok+ip4 */
                        (PKT_RX_L4_CKSUM_GOOD >> 1), /* 0011 l4+l4_ok */
                        (PKT_RX_L4_CKSUM_BAD >> 1),  /* 0010 l4+!l4_ok */
                        0, /* 0001 */
                        0, /* 0000 */
                        /* first 128 bits */
                        ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1),
                        ((PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1),
                        (PKT_RX_IP_CKSUM_GOOD >> 1),
                        (PKT_RX_IP_CKSUM_GOOD >> 1),
                        (PKT_RX_L4_CKSUM_GOOD >> 1),
                        (PKT_RX_L4_CKSUM_BAD >> 1),
                        0, 0,
                        ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD) >> 1),
                        ((PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1),
                        (PKT_RX_IP_CKSUM_BAD >> 1),
                        (PKT_RX_IP_CKSUM_BAD >> 1),
                        (PKT_RX_L4_CKSUM_GOOD >> 1),
                        (PKT_RX_L4_CKSUM_BAD >> 1),
                        0, 0);
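        /*
         * csum_shuffle is likewise a 16-entry byte lookup table, indexed
         * by the 4-bit value built from ipv4_csum_ok, ip4, tcp_or_udp, and
         * tcp_udp_csum_ok (see the loop below). The entries are stored
         * shifted right by one because the largest flag used,
         * PKT_RX_L4_CKSUM_GOOD (bit 8), does not fit in a byte; the lookup
         * result is shifted back left by one.
         */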
        /*
         * Non-fragment PTYPEs.
         * Shuffle 4-bit index:
         * ip6 => bit 0
         * ip4 => bit 1
         * udp => bit 2
         * tcp => bit 3
         *   bit
         * 3 2 1 0
         * -------
         * 0 0 0 0 unknown
         * 0 0 0 1 ip6 | nonfrag
         * 0 0 1 0 ip4 | nonfrag
         * 0 0 1 1 unknown
         * 0 1 0 0 unknown
         * 0 1 0 1 ip6 | udp
         * 0 1 1 0 ip4 | udp
         * 0 1 1 1 unknown
         * 1 0 0 0 unknown
         * 1 0 0 1 ip6 | tcp
         * 1 0 1 0 ip4 | tcp
         * 1 0 1 1 unknown
         * 1 1 0 0 unknown
         * 1 1 0 1 unknown
         * 1 1 1 0 unknown
         * 1 1 1 1 unknown
         *
         * PTYPEs do not fit in 8 bits, so shift right by 4.
         */
        const __m256i nonfrag_ptype_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        RTE_PTYPE_UNKNOWN,
                        /* first 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_TCP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN | RTE_PTYPE_L4_UDP) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_NONFRAG) >> 4,
                        RTE_PTYPE_UNKNOWN);
        /* Fragment PTYPEs. Use the same shuffle index as above. */
        const __m256i frag_ptype_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN,
                        /* first 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        (RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        (RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
                         RTE_PTYPE_L4_FRAG) >> 4,
                        RTE_PTYPE_UNKNOWN);
        /*
         * Tunnel PTYPEs. Use the same shuffle index as above.
         * L4 types are not part of this table. They come from non-tunnel
         * types above.
         */
        const __m256i tnl_l3_ptype_shuffle =
                _mm256_set_epi8(/* second 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN,
                        /* first 128 bits */
                        RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN, RTE_PTYPE_UNKNOWN,
                        RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN >> 16,
                        RTE_PTYPE_UNKNOWN);
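        /*
         * The RTE_PTYPE_INNER_L3_* values occupy bits 16-23 of the packet
         * type, so shifting them right by 16 makes them fit in the
         * byte-wide shuffle entries; the result is shifted back left by 16
         * after the lookup.
         */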

        const __m256i mbuf_init = _mm256_set_epi64x(0, enic->mbuf_initializer,
                                                    0, enic->mbuf_initializer);

        /*
         * --- cq desc fields ---    offset
         * completed_index_flags    - 0   use: fcoe
         * q_number_rss_type_flags  - 2   use: rss types, csum_not_calc
         * rss_hash                 - 4   ==> mbuf.hash.rss
         * bytes_written_flags      - 8   ==> mbuf.pkt_len,data_len
         *                                use: truncated, vlan_stripped
         * vlan                     - 10  ==> mbuf.vlan_tci
         * checksum_fcoe            - 12  (unused)
         * flags                    - 14  use: all bits
         * type_color               - 15  (unused)
         *
         * --- mbuf fields ---       offset
         * rearm_data              ---- 16
         * data_off    - 0      (mbuf_init) -+
         * refcnt      - 2      (mbuf_init)  |
         * nb_segs     - 4      (mbuf_init)  | 16B 128b
         * port        - 6      (mbuf_init)  |
         * ol_flag     - 8      (from cqd)  -+
         * rx_descriptor_fields1   ---- 32
         * packet_type - 0      (from cqd)  -+
         * pkt_len     - 4      (from cqd)   |
         * data_len    - 8      (from cqd)   | 16B 128b
         * vlan_tci    - 10     (from cqd)   |
         * rss         - 12     (from cqd)  -+
         */
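        /*
         * The 32B stores at the bottom of the loop rely on rearm_data
         * through hash.rss being laid out contiguously exactly as
         * described above; this matches the standard rte_mbuf layout,
         * with rearm_data at offset 16 and rx_descriptor_fields1 at
         * offset 32.
         */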

        __m256i overlay_enabled =
                _mm256_set1_epi32((uint32_t)enic->overlay_offload);

        /* Step 2: Process 8 packets per loop using SIMD */
        while (max_rx > 7 && (((cqd + 7)->type_color &
                               CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
                /* Load 8 16B CQ descriptors */
                __m256i cqd01 = _mm256_load_si256((void *)cqd);
                __m256i cqd23 = _mm256_load_si256((void *)(cqd + 2));
                __m256i cqd45 = _mm256_load_si256((void *)(cqd + 4));
                __m256i cqd67 = _mm256_load_si256((void *)(cqd + 6));
                /* Copy 8 mbuf pointers to rx_pkts */
                _mm256_storeu_si256((void *)rx,
                                    _mm256_loadu_si256((void *)rxmb));
                _mm256_storeu_si256((void *)(rx + 4),
                                    _mm256_loadu_si256((void *)(rxmb + 4)));
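                /*
                 * The two unaligned 256-bit copies above move all 8 mbuf
                 * pointers at once, assuming 8-byte pointers (4 per
                 * 256-bit register).
                 */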

                /*
                 * Collect 8 flags (each 32 bits) into one register.
                 * 4 shuffles, 3 blends, 1 permute for 8 desc: 1 inst/desc
                 */
                __m256i flags01 =
                        _mm256_shuffle_epi8(cqd01, flags_shuffle_mask);
                /*
                 * The shuffle above produces 8 x 32-bit flags for 8
                 * descriptors in this order: 0, 0, 0, 0, 1, 1, 1, 1
                 * The duplicates in each 128-bit lane simplify the
                 * blending below.
                 */
                __m256i flags23 =
                        _mm256_shuffle_epi8(cqd23, flags_shuffle_mask);
                __m256i flags45 =
                        _mm256_shuffle_epi8(cqd45, flags_shuffle_mask);
                __m256i flags67 =
                        _mm256_shuffle_epi8(cqd67, flags_shuffle_mask);
                /* 1st blend produces flags for desc: 0, 2, 0, 0, 1, 3, 1, 1 */
                __m256i flags0_3 = _mm256_blend_epi32(flags01, flags23, 0x22);
                /* 2nd blend produces flags for desc: 4, 4, 4, 6, 5, 5, 5, 7 */
                __m256i flags4_7 = _mm256_blend_epi32(flags45, flags67, 0x88);
                /* 3rd blend produces flags for desc: 0, 2, 4, 6, 1, 3, 5, 7 */
                __m256i flags0_7 = _mm256_blend_epi32(flags0_3, flags4_7, 0xcc);
                /*
                 * Swap to reorder flags in this order: 1, 3, 5, 7, 0, 2, 4, 6
                 * This order simplifies the blend operations further below
                 * that produce the 'rearm' data for each mbuf.
                 */
                flags0_7 = _mm256_permute4x64_epi64(flags0_7,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
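                /*
                 * _mm256_blend_epi32() selects dword i of the second
                 * operand when bit i of the immediate is set. So 0x22
                 * takes dwords 1 and 5 from flags23, 0x88 takes dwords 3
                 * and 7 from flags67, and 0xcc takes dwords 2, 3, 6, and 7
                 * from flags4_7, interleaving the four registers into one.
                 */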

                /*
                 * Check the truncated bits and bail out early if any are
                 * set.
                 * 6 avx inst, 1 or, 1 if-then-else for 8 desc: 1 inst/desc
                 */
                __m256i trunc =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 17), 31);
                trunc = _mm256_add_epi64(trunc, _mm256_permute4x64_epi64(trunc,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2));
                /* 0:63 contains 1+3+0+2 and 64:127 contains 5+7+4+6 */
                if (_mm256_extract_epi64(trunc, 0) ||
                    _mm256_extract_epi64(trunc, 1))
                        break;
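                /*
                 * The shift pair above isolates bit 14 of each 32-bit
                 * flags word, which is the truncated bit (bit 6 of the
                 * upper byte of bytes_written_flags). The scalar
                 * equivalent per descriptor is:
                 *   cqd->bytes_written_flags & CQ_ENET_RQ_DESC_FLAGS_TRUNCATED
                 */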

                /*
                 * Compute PKT_RX_RSS_HASH.
                 * Use 2 shifts and 1 shuffle for 8 desc: 0.375 inst/desc
                 * RSS types in byte 0, 4, 8, 12, 16, 20, 24, 28
                 * Everything else is zero.
                 */
                __m256i rss_types =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 10), 28);
                /*
                 * RSS flags (PKT_RX_RSS_HASH) are in
                 * byte 0, 4, 8, 12, 16, 20, 24, 28
                 * Everything else is zero.
                 */
                __m256i rss_flags = _mm256_shuffle_epi8(rss_shuffle, rss_types);

                /*
                 * Compute CKSUM flags. First build the index and then
                 * use it to shuffle csum_shuffle.
                 * 20 instructions including const loads: 2.5 inst/desc
                 */
                /*
                 * csum_not_calc (bit 22)
                 * csum_not_calc (0) => 0xffffffff
                 * csum_not_calc (1) => 0x0
                 */
                const __m256i zero4 = _mm256_setzero_si256();
                const __m256i mask22 = _mm256_set1_epi32(0x400000);
                __m256i csum_not_calc = _mm256_cmpeq_epi32(zero4,
                        _mm256_and_si256(flags0_7, mask22));
                /*
                 * (tcp|udp) && !fragment => bit 1
                 * tcp = bit 2, udp = bit 1, frag = bit 6
                 */
                const __m256i mask1 = _mm256_set1_epi32(0x2);
                __m256i tcp_udp =
                        _mm256_andnot_si256(_mm256_srli_epi32(flags0_7, 5),
                                _mm256_or_si256(flags0_7,
                                        _mm256_srli_epi32(flags0_7, 1)));
                tcp_udp = _mm256_and_si256(tcp_udp, mask1);
                /* ipv4 (bit 5) => bit 2 */
                const __m256i mask2 = _mm256_set1_epi32(0x4);
                __m256i ipv4 = _mm256_and_si256(mask2,
                        _mm256_srli_epi32(flags0_7, 3));
                /*
                 * ipv4_csum_ok (bit 3) => bit 3
                 * tcp_udp_csum_ok (bit 0) => bit 0
                 * 0x9
                 */
                const __m256i mask0_3 = _mm256_set1_epi32(0x9);
                __m256i csum_idx = _mm256_and_si256(flags0_7, mask0_3);
                csum_idx = _mm256_and_si256(csum_not_calc,
                        _mm256_or_si256(_mm256_or_si256(csum_idx, ipv4),
                                tcp_udp));
                __m256i csum_flags =
                        _mm256_shuffle_epi8(csum_shuffle, csum_idx);
                /* Shift left to restore CKSUM flags. See csum_shuffle. */
                csum_flags = _mm256_slli_epi32(csum_flags, 1);
                /* Combine csum flags and offload flags: 0.125 inst/desc */
                rss_flags = _mm256_or_si256(rss_flags, csum_flags);

                /*
                 * Collect 8 VLAN IDs and compute vlan_id != 0 on each.
                 * 4 shuffles, 3 blends, 1 permute, 1 cmp, 1 sub for 8 desc:
                 * 1.25 inst/desc
                 */
                __m256i vlan01 = _mm256_shuffle_epi8(cqd01, vlan_shuffle_mask);
                __m256i vlan23 = _mm256_shuffle_epi8(cqd23, vlan_shuffle_mask);
                __m256i vlan45 = _mm256_shuffle_epi8(cqd45, vlan_shuffle_mask);
                __m256i vlan67 = _mm256_shuffle_epi8(cqd67, vlan_shuffle_mask);
                __m256i vlan0_3 = _mm256_blend_epi32(vlan01, vlan23, 0x22);
                __m256i vlan4_7 = _mm256_blend_epi32(vlan45, vlan67, 0x88);
                /* desc: 0, 2, 4, 6, 1, 3, 5, 7 */
                __m256i vlan0_7 = _mm256_blend_epi32(vlan0_3, vlan4_7, 0xcc);
                /* desc: 1, 3, 5, 7, 0, 2, 4, 6 */
                vlan0_7 = _mm256_permute4x64_epi64(vlan0_7,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                /*
                 * Compare 0 == vlan_id produces 0xffffffff (-1) if
                 * vlan 0 and 0 if vlan non-0. Then subtracting the
                 * result from 0 produces 0 - (-1) = 1 for vlan 0, and
                 * 0 - 0 = 0 for vlan non-0.
                 */
                vlan0_7 = _mm256_cmpeq_epi32(zero4, vlan0_7);
                /* vlan_id != 0 => 0, vlan_id == 0 => 1 */
                vlan0_7 = _mm256_sub_epi32(zero4, vlan0_7);

                /*
                 * Compute PKT_RX_VLAN and PKT_RX_VLAN_STRIPPED.
                 * Use 3 shifts, 1 or, 1 shuffle for 8 desc: 0.625 inst/desc
                 * VLAN offload flags in byte 0, 4, 8, 12, 16, 20, 24, 28
                 * Everything else is zero.
                 */
                __m256i vlan_idx =
                        _mm256_or_si256(/* vlan_stripped => bit 0 */
                                _mm256_srli_epi32(_mm256_slli_epi32(flags0_7,
                                        16), 31),
                                /* (vlan_id == 0) => bit 1 */
                                _mm256_slli_epi32(vlan0_7, 1));
                /*
                 * The index captures 4 cases.
                 * stripped, id = 0   ==> 11b = 3
                 * stripped, id != 0  ==> 01b = 1
                 * not strip, id == 0 ==> 10b = 2
                 * not strip, id != 0 ==> 00b = 0
                 */
                __m256i vlan_flags = _mm256_permutevar8x32_epi32(vlan_shuffle,
                        vlan_idx);
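                /*
                 * vlan_shuffle is indexed with
                 * _mm256_permutevar8x32_epi32() rather than a byte
                 * shuffle: it selects whole dwords and crosses lanes, so
                 * the 32-bit PKT_RX_VLAN* combinations do not need to fit
                 * in a byte.
                 */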
                /* Combine vlan and offload flags: 0.125 inst/desc */
                rss_flags = _mm256_or_si256(rss_flags, vlan_flags);

                /*
                 * Compute non-tunnel PTYPEs.
                 * 17 inst / 8 desc = 2.125 inst/desc
                 */
                /* ETHER and ETHER_VLAN */
                __m256i vlan_ptype =
                        _mm256_permutevar8x32_epi32(vlan_ptype_shuffle,
                                vlan_idx);
                /* Build the ptype index from flags */
                tcp_udp = _mm256_slli_epi32(flags0_7, 29);
                tcp_udp = _mm256_slli_epi32(_mm256_srli_epi32(tcp_udp, 30), 2);
                __m256i ip4_ip6 =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 26), 30);
                __m256i ptype_idx = _mm256_or_si256(tcp_udp, ip4_ip6);
                __m256i frag_bit =
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 25), 31);
                __m256i nonfrag_ptype =
                        _mm256_shuffle_epi8(nonfrag_ptype_shuffle, ptype_idx);
                __m256i frag_ptype =
                        _mm256_shuffle_epi8(frag_ptype_shuffle, ptype_idx);
                /*
                 * Zero out the unwanted types and combine the remaining
                 * bits. The effect is the same as selecting non-frag or
                 * frag types depending on the frag bit.
                 */
                nonfrag_ptype = _mm256_and_si256(nonfrag_ptype,
                        _mm256_cmpeq_epi32(zero4, frag_bit));
                frag_ptype = _mm256_and_si256(frag_ptype,
                        _mm256_cmpgt_epi32(frag_bit, zero4));
                __m256i ptype = _mm256_or_si256(nonfrag_ptype, frag_ptype);
                ptype = _mm256_slli_epi32(ptype, 4);
                /*
                 * Compute tunnel PTYPEs.
                 * 15 inst / 8 desc = 1.875 inst/desc
                 */
                __m256i tnl_l3_ptype =
                        _mm256_shuffle_epi8(tnl_l3_ptype_shuffle, ptype_idx);
                tnl_l3_ptype = _mm256_slli_epi32(tnl_l3_ptype, 16);
                /*
                 * Shift non-tunnel L4 types to make them tunnel types.
                 * RTE_PTYPE_L4_TCP << 16 == RTE_PTYPE_INNER_L4_TCP
                 */
                __m256i tnl_l4_ptype =
                        _mm256_slli_epi32(_mm256_and_si256(ptype,
                                _mm256_set1_epi32(RTE_PTYPE_L4_MASK)), 16);
                __m256i tnl_ptype =
                        _mm256_or_si256(tnl_l3_ptype, tnl_l4_ptype);
                tnl_ptype = _mm256_or_si256(tnl_ptype,
                        _mm256_set1_epi32(RTE_PTYPE_TUNNEL_GRENAT |
                                RTE_PTYPE_INNER_L2_ETHER));
                /*
                 * Select non-tunnel or tunnel types by zeroing out the
                 * unwanted ones.
                 */
                __m256i tnl_flags = _mm256_and_si256(overlay_enabled,
                        _mm256_srli_epi32(_mm256_slli_epi32(flags0_7, 2), 31));
                tnl_ptype = _mm256_and_si256(tnl_ptype,
                        _mm256_sub_epi32(zero4, tnl_flags));
                ptype = _mm256_and_si256(ptype,
                        _mm256_cmpeq_epi32(zero4, tnl_flags));
                /*
                 * Combine types and swap to have ptypes in the same order
                 * as the descriptors.
                 * desc: 0 2 4 6 1 3 5 7
                 * 3 inst / 8 desc = 0.375 inst/desc
                 */
                ptype = _mm256_or_si256(ptype, tnl_ptype);
                ptype = _mm256_or_si256(ptype, vlan_ptype);
                ptype = _mm256_permute4x64_epi64(ptype,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);

                /*
                 * Mask packet length.
                 * Use 4 ands: 0.5 instructions/desc
                 */
                cqd01 = _mm256_and_si256(cqd01, mask);
                cqd23 = _mm256_and_si256(cqd23, mask);
                cqd45 = _mm256_and_si256(cqd45, mask);
                cqd67 = _mm256_and_si256(cqd67, mask);
                /*
                 * Shuffle. Two 16B sets of the mbuf fields.
                 * packet_type, pkt_len, data_len, vlan_tci, rss
                 */
                __m256i rearm01 = _mm256_shuffle_epi8(cqd01, shuffle_mask);
                __m256i rearm23 = _mm256_shuffle_epi8(cqd23, shuffle_mask);
                __m256i rearm45 = _mm256_shuffle_epi8(cqd45, shuffle_mask);
                __m256i rearm67 = _mm256_shuffle_epi8(cqd67, shuffle_mask);

                /*
                 * Blend in the ptypes.
                 * 4 blends and 3 shuffles for 8 desc: 0.875 inst/desc
                 */
                rearm01 = _mm256_blend_epi32(rearm01, ptype, 0x11);
                rearm23 = _mm256_blend_epi32(rearm23,
                        _mm256_shuffle_epi32(ptype, 1), 0x11);
                rearm45 = _mm256_blend_epi32(rearm45,
                        _mm256_shuffle_epi32(ptype, 2), 0x11);
                rearm67 = _mm256_blend_epi32(rearm67,
                        _mm256_shuffle_epi32(ptype, 3), 0x11);

                /*
                 * Move rss_flags into ol_flags in mbuf_init.
                 * Use at most 1 shift and 1 blend per descriptor pair:
                 * ~1 inst/desc
                 */
                __m256i mbuf_init4_5 = _mm256_blend_epi32(mbuf_init,
                        rss_flags, 0x44);
                __m256i mbuf_init2_3 = _mm256_blend_epi32(mbuf_init,
                        _mm256_slli_si256(rss_flags, 4), 0x44);
                __m256i mbuf_init0_1 = _mm256_blend_epi32(mbuf_init,
                        _mm256_slli_si256(rss_flags, 8), 0x44);
                __m256i mbuf_init6_7 = _mm256_blend_epi32(mbuf_init,
                        _mm256_srli_si256(rss_flags, 4), 0x44);
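                /*
                 * rss_flags holds the ol_flags words for descriptors
                 * 1, 3, 5, 7, 0, 2, 4, 6 in its dwords. The byte shifts
                 * above, which operate within each 128-bit lane, move the
                 * relevant descriptor's word into dword 2 or 6, and the
                 * 0x44 blend then drops it into the ol_flags slot
                 * (offset 8) of each 16-byte mbuf_init half.
                 */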

                /*
                 * Build rearm, one per desc.
                 * 8 blends and 4 permutes: 1.5 inst/desc
                 */
                __m256i rearm0 = _mm256_blend_epi32(rearm01,
                        mbuf_init0_1, 0xf0);
                __m256i rearm1 = _mm256_blend_epi32(mbuf_init0_1,
                        rearm01, 0xf0);
                __m256i rearm2 = _mm256_blend_epi32(rearm23,
                        mbuf_init2_3, 0xf0);
                __m256i rearm3 = _mm256_blend_epi32(mbuf_init2_3,
                        rearm23, 0xf0);
                /* Swap the upper and lower 128-bit halves */
                rearm0 = _mm256_permute4x64_epi64(rearm0,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                rearm2 = _mm256_permute4x64_epi64(rearm2,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                /* Second set of 4 descriptors */
                __m256i rearm4 = _mm256_blend_epi32(rearm45,
                        mbuf_init4_5, 0xf0);
                __m256i rearm5 = _mm256_blend_epi32(mbuf_init4_5,
                        rearm45, 0xf0);
                __m256i rearm6 = _mm256_blend_epi32(rearm67,
                        mbuf_init6_7, 0xf0);
                __m256i rearm7 = _mm256_blend_epi32(mbuf_init6_7,
                        rearm67, 0xf0);
                rearm4 = _mm256_permute4x64_epi64(rearm4,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);
                rearm6 = _mm256_permute4x64_epi64(rearm6,
                        (1 << 6) + (0 << 4) + (3 << 2) + 2);

                /*
                 * Write out 32B of mbuf fields.
                 * data_off    - off 0  (mbuf_init)
                 * refcnt      - 2      (mbuf_init)
                 * nb_segs     - 4      (mbuf_init)
                 * port        - 6      (mbuf_init)
                 * ol_flag     - 8      (from cqd)
                 * packet_type - 16     (from cqd)
                 * pkt_len     - 20     (from cqd)
                 * data_len    - 24     (from cqd)
                 * vlan_tci    - 26     (from cqd)
                 * rss         - 28     (from cqd)
                 */
                _mm256_storeu_si256((__m256i *)&rxmb[0]->rearm_data, rearm0);
                _mm256_storeu_si256((__m256i *)&rxmb[1]->rearm_data, rearm1);
                _mm256_storeu_si256((__m256i *)&rxmb[2]->rearm_data, rearm2);
                _mm256_storeu_si256((__m256i *)&rxmb[3]->rearm_data, rearm3);
                _mm256_storeu_si256((__m256i *)&rxmb[4]->rearm_data, rearm4);
                _mm256_storeu_si256((__m256i *)&rxmb[5]->rearm_data, rearm5);
                _mm256_storeu_si256((__m256i *)&rxmb[6]->rearm_data, rearm6);
                _mm256_storeu_si256((__m256i *)&rxmb[7]->rearm_data, rearm7);
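                /*
                 * The stores must be unaligned (storeu): rearm_data sits
                 * at offset 16 within the cache-line-aligned mbuf, so the
                 * 32B destination is only 16-byte aligned.
                 */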

                max_rx -= 8;
                cqd += 8;
                rx += 8;
                rxmb += 8;
        }

        /*
         * Step 3: Slow path to handle a small (<8) number of packets and
         * occasional truncated packets.
         */
        while (max_rx && ((cqd->type_color &
                           CQ_DESC_COLOR_MASK_NOSHIFT) != color)) {
                if (unlikely(cqd->bytes_written_flags &
                             CQ_ENET_RQ_DESC_FLAGS_TRUNCATED)) {
                        rte_pktmbuf_free(*rxmb++);
                        rte_atomic64_inc(&enic->soft_stats.rx_packet_errors);
                } else {
                        *rx++ = rx_one(cqd, *rxmb++, enic);
                }
                cqd++;
                max_rx--;
        }

        /* Number of descriptors visited */
        nb_rx = cqd - (struct cq_enet_rq_desc *)(cq->ring.descs) - cq_idx;
        if (nb_rx == 0)
                return 0;
        rqd = ((struct rq_enet_desc *)rq->ring.descs) + cq_idx;
        rxmb = rq->mbuf_ring + cq_idx;
        cq_idx += nb_rx;
        rq->rx_nb_hold += nb_rx;
        if (unlikely(cq_idx == cq->ring.desc_count)) {
                cq_idx = 0;
                cq->last_color ^= CQ_DESC_COLOR_MASK_NOSHIFT;
        }
        cq->to_clean = cq_idx;

        /* Step 4: Restock RQ with new mbufs */
        memcpy(rxmb, rq->free_mbufs + ENIC_RX_BURST_MAX - rq->num_free_mbufs,
               sizeof(struct rte_mbuf *) * nb_rx);
        rq->num_free_mbufs -= nb_rx;
        while (nb_rx) {
                rqd->address = (*rxmb)->buf_iova + RTE_PKTMBUF_HEADROOM;
                nb_rx--;
                rqd++;
                rxmb++;
        }
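        /*
         * Post the doorbell only after at least rx_free_thresh
         * descriptors have accumulated. The rte_wmb() ensures the
         * rqd->address writes above are visible to the NIC before
         * posted_index is updated.
         */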
        if (rq->rx_nb_hold > rq->rx_free_thresh) {
                rq->posted_index = enic_ring_add(rq->ring.desc_count,
                                                 rq->posted_index,
                                                 rq->rx_nb_hold);
                rq->rx_nb_hold = 0;
                rte_wmb();
                iowrite32_relaxed(rq->posted_index,
                                  &rq->ctrl->posted_index);
        }

        return rx - rx_pkts;
}

bool
enic_use_vector_rx_handler(struct rte_eth_dev *eth_dev)
{
        struct enic *enic = pmd_priv(eth_dev);
        struct rte_fdir_conf *fconf;

        /* The user must explicitly request the avx2 handler */
        if (!enic->enable_avx2_rx)
                return false;
        /* Scatter Rx is not supported */
        if (!(enic->rq_count > 0 && enic->rq[0].data_queue_enable == 0))
                return false;
        /* Flow director (fdir) is not supported */
        fconf = &eth_dev->data->dev_conf.fdir_conf;
        if (fconf->mode != RTE_FDIR_MODE_NONE)
                return false;
        if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) &&
                        rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256) {
                ENICPMD_LOG(DEBUG, " use the non-scatter avx2 Rx handler");
                eth_dev->rx_pkt_burst = &enic_noscatter_vec_recv_pkts;
                enic->use_noscatter_vec_rx_handler = 1;
                return true;
        }
        return false;
}