dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2015 Intel Corporation.
 * Copyright(c) 2016-2018, Linaro Limited.
 */

#include <stdint.h>
#include <ethdev_driver.h>
#include <rte_malloc.h>
#include <rte_vect.h>

#include "base/i40e_prototype.h"
#include "base/i40e_type.h"
#include "i40e_ethdev.h"
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"


#pragma GCC diagnostic ignored "-Wcast-qual"
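
/* The NEON loads/stores below cast the volatile descriptor-ring pointers to
 * plain (uint64_t *) for the intrinsics; the pragma above suppresses the
 * resulting -Wcast-qual warnings.
 */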

static inline void
i40e_rxq_rearm(struct i40e_rx_queue *rxq)
{
        int i;
        uint16_t rx_id;
        volatile union i40e_rx_desc *rxdp;
        struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
        struct rte_mbuf *mb0, *mb1;
        uint64x2_t dma_addr0, dma_addr1;
        uint64x2_t zero = vdupq_n_u64(0);
        uint64_t paddr;

        rxdp = rxq->rx_ring + rxq->rxrearm_start;

        /* Pull 'n' more MBUFs into the software ring */
        if (unlikely(rte_mempool_get_bulk(rxq->mp,
                                          (void *)rxep,
                                          RTE_I40E_RXQ_REARM_THRESH) < 0)) {
                if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
                    rxq->nb_rx_desc) {
                        for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
                                rxep[i].mbuf = &rxq->fake_mbuf;
                                vst1q_u64((uint64_t *)&rxdp[i].read, zero);
                        }
                }
                rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
                        RTE_I40E_RXQ_REARM_THRESH;
                return;
        }

        /* Initialize the mbufs in vector, process 2 mbufs in one loop */
        for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
                mb0 = rxep[0].mbuf;
                mb1 = rxep[1].mbuf;

                paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
                dma_addr0 = vdupq_n_u64(paddr);

                /* flush desc with pa dma_addr */
                vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);

                paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
                dma_addr1 = vdupq_n_u64(paddr);
                vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
        }

        rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
        if (rxq->rxrearm_start >= rxq->nb_rx_desc)
                rxq->rxrearm_start = 0;

        rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;

        rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
                             (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

        rte_io_wmb();
        /* Update the tail pointer on the NIC */
        I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
}
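
/* Note on i40e_rxq_rearm(): the rx_id written to the tail register is the
 * index of the last descriptor just refilled, i.e. rxrearm_start - 1,
 * wrapping to nb_rx_desc - 1 when rxrearm_start has wrapped to 0 (for
 * example, with 512 descriptors and a rearm that ends exactly at the ring
 * end, the tail is written as 511). The rte_io_wmb() makes the descriptor
 * stores visible to the device before the doorbell write.
 */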

#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
/* NEON version of FDIR mark extraction for 4 32B descriptors at a time */
static inline uint32x4_t
descs_to_fdir_32b(volatile union i40e_rx_desc *rxdp, struct rte_mbuf **rx_pkt)
{
        /* 32B descriptors: Load 2nd half of descriptors for FDIR ID data */
        uint64x2_t desc0_qw23, desc1_qw23, desc2_qw23, desc3_qw23;
        desc0_qw23 = vld1q_u64((uint64_t *)&(rxdp + 0)->wb.qword2);
        desc1_qw23 = vld1q_u64((uint64_t *)&(rxdp + 1)->wb.qword2);
        desc2_qw23 = vld1q_u64((uint64_t *)&(rxdp + 2)->wb.qword2);
        desc3_qw23 = vld1q_u64((uint64_t *)&(rxdp + 3)->wb.qword2);

        /* FDIR ID data: move last u32 of each desc to 4 u32 lanes */
        uint32x4_t v_unpack_02, v_unpack_13;
        v_unpack_02 = vzipq_u32(vreinterpretq_u32_u64(desc0_qw23),
                                vreinterpretq_u32_u64(desc2_qw23)).val[1];
        v_unpack_13 = vzipq_u32(vreinterpretq_u32_u64(desc1_qw23),
                                vreinterpretq_u32_u64(desc3_qw23)).val[1];
        uint32x4_t v_fdir_ids = vzipq_u32(v_unpack_02, v_unpack_13).val[1];

        /* Extended Status: extract from each lower 32 bits, to u32 lanes */
        v_unpack_02 = vzipq_u32(vreinterpretq_u32_u64(desc0_qw23),
                                vreinterpretq_u32_u64(desc2_qw23)).val[0];
        v_unpack_13 = vzipq_u32(vreinterpretq_u32_u64(desc1_qw23),
                                vreinterpretq_u32_u64(desc3_qw23)).val[0];
        uint32x4_t v_flt_status = vzipq_u32(v_unpack_02, v_unpack_13).val[0];

        /* Shift u32 left and right to "mask away" bits not required.
         * Data required is 4:5 (zero based), so left shift by 26 (32-6)
         * and then right shift by 30 (32 - 2 bits required).
         */
        v_flt_status = vshlq_n_u32(v_flt_status, 26);
        v_flt_status = vshrq_n_u32(v_flt_status, 30);

        /* Generate constant 1 in all u32 lanes */
        RTE_BUILD_BUG_ON(I40E_RX_DESC_EXT_STATUS_FLEXBH_FD_ID != 1);
        uint32x4_t v_u32_one = vdupq_n_u32(1);

        /* Per desc mask, bits set if FDIR ID is valid */
        uint32x4_t v_fd_id_mask = vceqq_u32(v_flt_status, v_u32_one);

        /* Mask ID data to zero if the FD_ID bit not set in desc */
        v_fdir_ids = vandq_u32(v_fdir_ids, v_fd_id_mask);

        /* Store data to fdir.hi in mbuf */
        rx_pkt[0]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 0);
        rx_pkt[1]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 1);
        rx_pkt[2]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 2);
        rx_pkt[3]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 3);

        /* Convert fdir_id_mask into a single bit, then shift as required for
         * correct location in the mbuf->olflags
         */
        RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
        v_fd_id_mask = vshrq_n_u32(v_fd_id_mask, 31);
        v_fd_id_mask = vshlq_n_u32(v_fd_id_mask, 13);

        /* The returned value must be combined into each mbuf. This is already
         * being done for RSS and VLAN mbuf olflags, so return bits to OR in.
         */
        return v_fd_id_mask;
}
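
/* Note on descs_to_fdir_32b(): besides writing the FDIR ID into each mbuf's
 * hash.fdir.hi, it returns one u32 lane per packet containing
 * RTE_MBUF_F_RX_FDIR_ID (bit 13) when the descriptor's FLEXBH field reports
 * a valid FDIR ID, ready to be ORed into the ol_flags computed by the caller.
 */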

#else /* 32 or 16B FDIR ID handling */

/* Handle 16B descriptor FDIR ID flag setting based on FLM(bit11). See scalar driver
 * for scalar implementation of the same functionality.
 */
static inline uint32x4_t
descs_to_fdir_16b(uint32x4_t fltstat, uint64x2_t descs[4], struct rte_mbuf **rx_pkt)
{
        /* Unpack filter-status data from descriptors */
        uint32x4_t v_tmp_02 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
                                        vreinterpretq_u32_u64(descs[2])).val[0];
        uint32x4_t v_tmp_13 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
                                        vreinterpretq_u32_u64(descs[3])).val[0];
        uint32x4_t v_fdir_ids = vzipq_u32(v_tmp_02, v_tmp_13).val[1];

        /* Generate 111 and 11 in each u32 lane */
        uint32x4_t v_111_mask = vdupq_n_u32(7);
        uint32x4_t v_11_mask = vdupq_n_u32(3);

        /* Compare and mask away FDIR ID data if bit not set */
        uint32x4_t v_u32_bits = vandq_u32(v_111_mask, fltstat);
        uint32x4_t v_fdir_id_mask = vceqq_u32(v_u32_bits, v_11_mask);
        v_fdir_ids = vandq_u32(v_fdir_id_mask, v_fdir_ids);

        /* Store data to fdir.hi in mbuf */
        rx_pkt[0]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 0);
        rx_pkt[1]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 1);
        rx_pkt[2]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 2);
        rx_pkt[3]->hash.fdir.hi = vgetq_lane_u32(v_fdir_ids, 3);

        /* Top lane ones mask for FDIR isolation */
        uint32x4_t v_desc_fdir_mask = {0, UINT32_MAX, 0, 0};

        /* Move fdir_id_mask to correct lane, zero RSS in mbuf if fdir hits */
        uint32x4_t v_zeros = {0, 0, 0, 0};
        uint32x4_t v_desc3_shift = vextq_u32(v_fdir_id_mask, v_zeros, 2);
        uint32x4_t v_desc3_mask = vandq_u32(v_desc_fdir_mask, v_desc3_shift);
        descs[3] = vreinterpretq_u64_u32(vbslq_u32(v_desc3_mask, v_zeros,
                                vreinterpretq_u32_u64(descs[3])));

        uint32x4_t v_desc2_shift = vextq_u32(v_fdir_id_mask, v_zeros, 1);
        uint32x4_t v_desc2_mask = vandq_u32(v_desc_fdir_mask, v_desc2_shift);
        descs[2] = vreinterpretq_u64_u32(vbslq_u32(v_desc2_mask, v_zeros,
                                vreinterpretq_u32_u64(descs[2])));

        uint32x4_t v_desc1_shift = v_fdir_id_mask;
        uint32x4_t v_desc1_mask = vandq_u32(v_desc_fdir_mask, v_desc1_shift);
        descs[1] = vreinterpretq_u64_u32(vbslq_u32(v_desc1_mask, v_zeros,
                                vreinterpretq_u32_u64(descs[1])));

        uint32x4_t v_desc0_shift = vextq_u32(v_zeros, v_fdir_id_mask, 3);
        uint32x4_t v_desc0_mask = vandq_u32(v_desc_fdir_mask, v_desc0_shift);
        descs[0] = vreinterpretq_u64_u32(vbslq_u32(v_desc0_mask, v_zeros,
                                vreinterpretq_u32_u64(descs[0])));

        /* Shift to 1 or 0 bit per u32 lane, then to RTE_MBUF_F_RX_FDIR_ID offset */
        RTE_BUILD_BUG_ON(RTE_MBUF_F_RX_FDIR_ID != (1 << 13));
        uint32x4_t v_mask_one_bit = vshrq_n_u32(v_fdir_id_mask, 31);
        return vshlq_n_u32(v_mask_one_bit, 13);
}
#endif
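
/* Note on the two FDIR paths: with 32B descriptors the FDIR ID lives in the
 * second 16 bytes, so descs_to_fdir_32b() must reload qword2/3 from the ring;
 * with 16B descriptors the ID is already in descs[], and descs_to_fdir_16b()
 * additionally zeroes the RSS-hash word of any descriptor that carries an
 * FDIR ID so the later shuffle does not report a stale RSS hash.
 */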

static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, volatile union i40e_rx_desc *rxdp,
                  uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
{
        uint32x4_t vlan0, vlan1, rss, l3_l4e;
        const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
        uint64x2_t rearm0, rearm1, rearm2, rearm3;

        /* mask everything except RSS, flow director and VLAN flags
         * bit2 is for VLAN tag, bit11 for flow director indication
         * bit13:12 for RSS indication.
         */
        const uint32x4_t rss_vlan_msk = {
                        0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

        const uint32x4_t cksum_mask = {
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                        RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                        RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                        RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD,
                        RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_IP_CKSUM_BAD |
                        RTE_MBUF_F_RX_L4_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                        RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD};

        /* map rss and vlan type to rss hash and vlan flag */
        const uint8x16_t vlan_flags = {
                        0, 0, 0, 0,
                        RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED, 0, 0, 0,
                        0, 0, 0, 0,
                        0, 0, 0, 0};

        const uint8x16_t rss_flags = {
                        0, RTE_MBUF_F_RX_FDIR, 0, 0,
                        0, 0, RTE_MBUF_F_RX_RSS_HASH, RTE_MBUF_F_RX_RSS_HASH | RTE_MBUF_F_RX_FDIR,
                        0, 0, 0, 0,
                        0, 0, 0, 0};

        const uint8x16_t l3_l4e_flags = {
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_GOOD) >> 1,
                        RTE_MBUF_F_RX_IP_CKSUM_BAD >> 1,
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
                        (RTE_MBUF_F_RX_L4_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD) >> 1,
                        (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                        (RTE_MBUF_F_RX_IP_CKSUM_GOOD | RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD |
                         RTE_MBUF_F_RX_L4_CKSUM_BAD) >> 1,
                        (RTE_MBUF_F_RX_OUTER_IP_CKSUM_BAD | RTE_MBUF_F_RX_L4_CKSUM_BAD |
                         RTE_MBUF_F_RX_IP_CKSUM_BAD) >> 1,
                        0, 0, 0, 0, 0, 0, 0, 0};

        vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
                          vreinterpretq_u32_u64(descs[2])).val[1];
        vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
                          vreinterpretq_u32_u64(descs[3])).val[1];
        vlan0 = vzipq_u32(vlan0, vlan1).val[0];

        vlan1 = vandq_u32(vlan0, rss_vlan_msk);
        vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
                                                vreinterpretq_u8_u32(vlan1)));

        const uint32x4_t desc_fltstat = vshrq_n_u32(vlan1, 11);
        rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
                                              vreinterpretq_u8_u32(desc_fltstat)));

        l3_l4e = vshrq_n_u32(vlan1, 22);
        l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
                                              vreinterpretq_u8_u32(l3_l4e)));
        /* then we shift left 1 bit */
        l3_l4e = vshlq_n_u32(l3_l4e, 1);
        /* we need to mask out the redundant bits */
        l3_l4e = vandq_u32(l3_l4e, cksum_mask);

        vlan0 = vorrq_u32(vlan0, rss);
        vlan0 = vorrq_u32(vlan0, l3_l4e);

        /* Extract FDIR ID only if FDIR is enabled to avoid useless work */
        if (rxq->fdir_enabled) {
#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
                uint32x4_t v_fdir_ol_flags = descs_to_fdir_32b(rxdp, rx_pkts);
#else
                (void)rxdp; /* rxdp not required for 16B desc mode */
                uint32x4_t v_fdir_ol_flags = descs_to_fdir_16b(desc_fltstat, descs, rx_pkts);
#endif
                /* OR in ol_flag bits after descriptor specific extraction */
                vlan0 = vorrq_u32(vlan0, v_fdir_ol_flags);
        }

        rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
        rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
        rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
        rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

        vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
        vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
        vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
        vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}
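
/* Note on desc_to_olflags_v(): each packet's 32-bit flag word (VLAN, RSS,
 * FDIR and checksum bits) is placed in lane 1 of a 128-bit vector whose lane
 * 0 holds rxq->mbuf_initializer, so a single 16-byte store per mbuf rewrites
 * the rearm_data fields and ol_flags together.
 */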

#define PKTLEN_SHIFT     10
#define I40E_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))

static inline void
desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
                uint32_t *__rte_restrict ptype_tbl)
{
        int i;
        uint8_t ptype;
        uint8x16_t tmp;

        for (i = 0; i < 4; i++) {
                tmp = vreinterpretq_u8_u64(vshrq_n_u64(descs[i], 30));
                ptype = vgetq_lane_u8(tmp, 8);
                rx_pkts[i]->packet_type = ptype_tbl[ptype];
        }
}
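
/* Note on desc_to_ptype_v(): shifting both descriptor qwords right by 30
 * bits moves the 8-bit PTYPE field of qword1 (bits 37:30) into byte 8 of the
 * vector, which is then used to index the adapter's ptype_tbl.
 */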

/**
 * vPMD raw receive routine that only accepts nb_pkts >= RTE_I40E_DESCS_PER_LOOP
 *
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are received
 * - nb_pkts is floor-aligned down to a multiple of RTE_I40E_DESCS_PER_LOOP
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
                   struct rte_mbuf **__rte_restrict rx_pkts,
                   uint16_t nb_pkts, uint8_t *split_packet)
{
        volatile union i40e_rx_desc *rxdp;
        struct i40e_rx_entry *sw_ring;
        uint16_t nb_pkts_recd;
        int pos;
        uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

        /* mask to shuffle from desc. to mbuf */
        uint8x16_t shuf_msk = {
                0xFF, 0xFF,   /* pkt_type set as unknown */
                0xFF, 0xFF,   /* pkt_type set as unknown */
                14, 15,       /* octet 15~14, low 16 bits pkt_len */
                0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
                14, 15,       /* octet 15~14, 16 bits data_len */
                2, 3,         /* octet 2~3, low 16 bits vlan_macip */
                4, 5, 6, 7    /* octet 4~7, 32bits rss */
                };

        uint8x16_t eop_check = {
                0x02, 0x00, 0x02, 0x00,
                0x02, 0x00, 0x02, 0x00,
                0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00
                };

        uint16x8_t crc_adjust = {
                0, 0,         /* ignore pkt_type field */
                rxq->crc_len, /* sub crc on pkt_len */
                0,            /* ignore high-16bits of pkt_len */
                rxq->crc_len, /* sub crc on data_len */
                0, 0, 0       /* ignore non-length fields */
                };

        /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
        nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

        /* Just the act of getting into the function from the application is
         * going to cost about 7 cycles
         */
        rxdp = rxq->rx_ring + rxq->rx_tail;

        rte_prefetch_non_temporal(rxdp);

        /* See if we need to rearm the RX queue - gives the prefetch a bit
         * of time to act
         */
        if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
                i40e_rxq_rearm(rxq);

        /* Before we start moving massive data around, check to see if
         * there is actually a packet available
         */
        if (!(rxdp->wb.qword1.status_error_len &
                        rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
                return 0;

        /* Cache is empty -> need to scan the buffer rings, but first move
         * the next 'n' mbufs into the cache
         */
        sw_ring = &rxq->sw_ring[rxq->rx_tail];

        /* A. load 4 packets in one loop
         * [A*. mask out 4 unused dirty fields in desc]
         * B. copy 4 mbuf pointers from sw_ring to rx_pkts
         * C. count the DD bits among the 4 packets
         * [C*. extract the end-of-packet bit, if requested]
         * D. fill packet info from descs into mbufs
         */

        for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
                        pos += RTE_I40E_DESCS_PER_LOOP,
                        rxdp += RTE_I40E_DESCS_PER_LOOP) {
                uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
                uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
                uint16x8x2_t sterr_tmp1, sterr_tmp2;
                uint64x2_t mbp1, mbp2;
                uint16x8_t staterr;
                uint16x8_t tmp;
                uint64_t stat;

                int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

                /* A.1 load desc[3-0] */
                descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
                descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
                descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
                descs[0] =  vld1q_u64((uint64_t *)(rxdp));

                /* Use acquire fence to order loads of descriptor qwords */
                rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
                /* A.2 reload qword0 to make it ordered after qword1 load */
                descs[3] = vld1q_lane_u64((uint64_t *)(rxdp + 3), descs[3], 0);
                descs[2] = vld1q_lane_u64((uint64_t *)(rxdp + 2), descs[2], 0);
                descs[1] = vld1q_lane_u64((uint64_t *)(rxdp + 1), descs[1], 0);
                descs[0] = vld1q_lane_u64((uint64_t *)(rxdp), descs[0], 0);

                /* B.1 load 4 mbuf pointers */
                mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
                mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

                /* B.2 copy 4 mbuf pointers into rx_pkts */
                vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);
                vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

                if (split_packet) {
                        rte_mbuf_prefetch_part2(rx_pkts[pos]);
                        rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
                        rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
                        rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
                }

                /* shift the pktlen field so it is 16-bit aligned */
                uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
                                            len_shl);
                descs[3] = vreinterpretq_u64_u16(vsetq_lane_u16
                                (vgetq_lane_u16(vreinterpretq_u16_u32(len3), 7),
                                 vreinterpretq_u16_u64(descs[3]),
                                 7));
                uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
                                            len_shl);
                descs[2] = vreinterpretq_u64_u16(vsetq_lane_u16
                                (vgetq_lane_u16(vreinterpretq_u16_u32(len2), 7),
                                 vreinterpretq_u16_u64(descs[2]),
                                 7));
                uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
                                            len_shl);
                descs[1] = vreinterpretq_u64_u16(vsetq_lane_u16
                                (vgetq_lane_u16(vreinterpretq_u16_u32(len1), 7),
                                 vreinterpretq_u16_u64(descs[1]),
                                 7));
                uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
                                            len_shl);
                descs[0] = vreinterpretq_u64_u16(vsetq_lane_u16
                                (vgetq_lane_u16(vreinterpretq_u16_u32(len0), 7),
                                 vreinterpretq_u16_u64(descs[0]),
                                 7));

                desc_to_olflags_v(rxq, rxdp, descs, &rx_pkts[pos]);

                /* D.1 pkts convert format from desc to pktmbuf */
                pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
                pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);
                pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
                pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

                /* D.2 pkts set in_port/nb_seg and remove crc */
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
                pkt_mb4 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
                pkt_mb3 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
                pkt_mb2 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
                pkt_mb1 = vreinterpretq_u8_u16(tmp);

                /* D.3 copy final data to rx_pkts */
                vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
                                pkt_mb4);
                vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
                                pkt_mb3);
                vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
                                pkt_mb2);
                vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                                pkt_mb1);

                desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);

                if (likely(pos + RTE_I40E_DESCS_PER_LOOP < nb_pkts)) {
                        rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);
                }

                /* C.1 4=>2 filter staterr info only */
                sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
                                       vreinterpretq_u16_u64(descs[3]));
                sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
                                       vreinterpretq_u16_u64(descs[2]));

                /* C.2 get 4 pkts staterr value */
                staterr = vzipq_u16(sterr_tmp1.val[1],
                                    sterr_tmp2.val[1]).val[0];

                /* C* extract and record EOP bit */
                if (split_packet) {
                        uint8x16_t eop_shuf_mask = {
                                        0x00, 0x02, 0x04, 0x06,
                                        0xFF, 0xFF, 0xFF, 0xFF,
                                        0xFF, 0xFF, 0xFF, 0xFF,
                                        0xFF, 0xFF, 0xFF, 0xFF};
                        uint8x16_t eop_bits;

                        /* and with mask to extract bits, flipping 1-0 */
                        eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
                        eop_bits = vandq_u8(eop_bits, eop_check);
                        /* Only the DD-count logic below is order-insensitive;
                         * end-of-packet tracking needs one flag per packet, so
                         * shuffle the low byte of each 16-bit status word into
                         * the first four bytes for a single 32-bit store.
                         */
                        eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

                        /* store the resulting 32-bit value */
                        vst1q_lane_u32((uint32_t *)split_packet,
                                       vreinterpretq_u32_u8(eop_bits), 0);
                        split_packet += RTE_I40E_DESCS_PER_LOOP;

                        /* zero-out next pointers */
                        rx_pkts[pos]->next = NULL;
                        rx_pkts[pos + 1]->next = NULL;
                        rx_pkts[pos + 2]->next = NULL;
                        rx_pkts[pos + 3]->next = NULL;
                }

                staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
                staterr = vreinterpretq_u16_s16(
                                vshrq_n_s16(vreinterpretq_s16_u16(staterr),
                                            I40E_UINT16_BIT - 1));
                stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

                /* C.4 calc available number of desc */
                if (unlikely(stat == 0)) {
                        nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
                } else {
                        nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
                        break;
                }
        }

        /* Update our internal tail pointer */
        rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
        rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
        rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

        return nb_pkts_recd;
}
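
/* Note on the DD accounting above: after the shift pair, each of the first
 * four 16-bit lanes of staterr is 0xFFFF when that descriptor's DD bit was
 * set and 0 otherwise, so counting the trailing zero bits of ~staterr and
 * dividing by 16 yields the number of consecutively completed descriptors.
 * For example, if only the first two descriptors are done, stat is
 * 0xFFFFFFFF00000000 and __builtin_ctzl(stat) / 16 == 2.
 */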

/*
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are received
 * - if nb_pkts > RTE_I40E_VPMD_RX_BURST, only RTE_I40E_VPMD_RX_BURST
 *   descriptors' DD bits are scanned
 */
uint16_t
i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
                struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
{
        return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}
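
/* Typical usage from an application's point of view: i40e_recv_pkts_vec() is
 * not called directly; once the NEON path is selected it is installed as the
 * device's RX burst handler and reached through the generic API, e.g.
 *
 *     struct rte_mbuf *bufs[RTE_I40E_VPMD_RX_BURST];
 *     uint16_t n = rte_eth_rx_burst(port_id, queue_id, bufs,
 *                                   RTE_I40E_VPMD_RX_BURST);
 *
 * where port_id and queue_id stand for the application's own values.
 */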

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 *
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are received
 */
static uint16_t
i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                              uint16_t nb_pkts)
{
        struct i40e_rx_queue *rxq = rx_queue;
        uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};

        /* get some new buffers */
        uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
                        split_flags);
        if (nb_bufs == 0)
                return 0;

        /* happy day case, full burst + no packets to be joined */
        const uint64_t *split_fl64 = (uint64_t *)split_flags;

        if (rxq->pkt_first_seg == NULL &&
                        split_fl64[0] == 0 && split_fl64[1] == 0 &&
                        split_fl64[2] == 0 && split_fl64[3] == 0)
                return nb_bufs;

        /* reassemble any packets that need reassembly */
        unsigned i = 0;

        if (rxq->pkt_first_seg == NULL) {
                /* find the first split flag, and only reassemble from there */
                while (i < nb_bufs && !split_flags[i])
                        i++;
                if (i == nb_bufs)
                        return nb_bufs;
                rxq->pkt_first_seg = rx_pkts[i];
        }
        return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
                &split_flags[i]);
}
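
/* Note on split_flags: _recv_raw_pkts_vec() records a non-zero byte for every
 * buffer whose descriptor had the EOP bit clear, i.e. the packet continues in
 * the next buffer; reassemble_packets() (from i40e_rxtx_vec_common.h) then
 * chains those buffers into multi-segment mbufs.
 */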

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
                             uint16_t nb_pkts)
{
        uint16_t retval = 0;

        while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
                uint16_t burst;

                burst = i40e_recv_scattered_burst_vec(rx_queue,
                                                      rx_pkts + retval,
                                                      RTE_I40E_VPMD_RX_BURST);
                retval += burst;
                nb_pkts -= burst;
                if (burst < RTE_I40E_VPMD_RX_BURST)
                        return retval;
        }

        return retval + i40e_recv_scattered_burst_vec(rx_queue,
                                                      rx_pkts + retval,
                                                      nb_pkts);
}
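
/* Note on i40e_recv_scattered_pkts_vec(): requests larger than
 * RTE_I40E_VPMD_RX_BURST are served in bursts of that size; a short burst
 * means the ring had nothing more to deliver, so the loop returns early
 * instead of issuing another pass.
 */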

static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
                struct rte_mbuf *pkt, uint64_t flags)
{
        uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
                        ((uint64_t)flags << I40E_TXD_QW1_CMD_SHIFT) |
                        ((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));

        uint64x2_t descriptor = {pkt->buf_iova + pkt->data_off, high_qw};
        vst1q_u64((uint64_t *)txdp, descriptor);
}
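
/* Note on vtx1(): it builds one 16-byte TX data descriptor in a single NEON
 * store: the low quadword holds the buffer DMA address (buf_iova + data_off)
 * and the high quadword packs the descriptor type, the command flags and the
 * buffer length at their respective QW1 shifts.
 */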

static inline void
vtx(volatile struct i40e_tx_desc *txdp, struct rte_mbuf **pkt,
                uint16_t nb_pkts, uint64_t flags)
{
        int i;

        for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
                vtx1(txdp, *pkt, flags);
}

uint16_t
i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue,
        struct rte_mbuf **__rte_restrict tx_pkts, uint16_t nb_pkts)
{
        struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
        volatile struct i40e_tx_desc *txdp;
        struct i40e_tx_entry *txep;
        uint16_t n, nb_commit, tx_id;
        uint64_t flags = I40E_TD_CMD;
        uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
        int i;

        /* crossing the tx_rs_thresh boundary is not allowed */
        nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

        if (txq->nb_tx_free < txq->tx_free_thresh)
                i40e_tx_free_bufs(txq);

        nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
        if (unlikely(nb_pkts == 0))
                return 0;

        tx_id = txq->tx_tail;
        txdp = &txq->tx_ring[tx_id];
        txep = &txq->sw_ring[tx_id];

        txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

        n = (uint16_t)(txq->nb_tx_desc - tx_id);
        if (nb_commit >= n) {
                tx_backlog_entry(txep, tx_pkts, n);

                for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
                        vtx1(txdp, *tx_pkts, flags);

                vtx1(txdp, *tx_pkts++, rs);

                nb_commit = (uint16_t)(nb_commit - n);

                tx_id = 0;
                txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

                /* avoid reaching the end of the ring */
                txdp = &txq->tx_ring[tx_id];
                txep = &txq->sw_ring[tx_id];
        }

        tx_backlog_entry(txep, tx_pkts, nb_commit);

        vtx(txdp, tx_pkts, nb_commit, flags);

        tx_id = (uint16_t)(tx_id + nb_commit);
        if (tx_id > txq->tx_next_rs) {
                txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
                        rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
                                                I40E_TXD_QW1_CMD_SHIFT);
                txq->tx_next_rs =
                        (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
        }

        txq->tx_tail = tx_id;

        rte_io_wmb();
        I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);

        return nb_pkts;
}
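
/* Note on i40e_xmit_fixed_burst_vec(): when the burst wraps the ring end, the
 * last descriptor before the wrap is written with the RS bit already set;
 * otherwise RS is ORed into the descriptor at tx_next_rs once the new tail
 * passes it, and tx_next_rs advances by tx_rs_thresh. The rte_io_wmb() orders
 * the descriptor writes ahead of the tail-register doorbell.
 */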

void __rte_cold
i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq)
{
        _i40e_rx_queue_release_mbufs_vec(rxq);
}

int __rte_cold
i40e_rxq_vec_setup(struct i40e_rx_queue *rxq)
{
        return i40e_rxq_vec_setup_default(rxq);
}

int __rte_cold
i40e_txq_vec_setup(struct i40e_tx_queue __rte_unused *txq)
{
        return 0;
}

int __rte_cold
i40e_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
{
        return i40e_rx_vec_dev_conf_condition_check_default(dev);
}