dpdk/drivers/net/i40e/i40e_rxtx_vec_neon.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2015 Intel Corporation.
 * Copyright(c) 2016-2018, Linaro Limited.
 */

#include <stdint.h>
#include <rte_ethdev_driver.h>
#include <rte_malloc.h>
#include <rte_vect.h>

#include "base/i40e_prototype.h"
#include "base/i40e_type.h"
#include "i40e_ethdev.h"
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"

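/*
 * Note: the descriptor ring is declared volatile, but the NEON load/store
 * helpers below take plain uint64_t pointers, so the casts drop the volatile
 * qualifier; -Wcast-qual is silenced for this file for that reason.
 */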
#pragma GCC diagnostic ignored "-Wcast-qual"

static inline void
i40e_rxq_rearm(struct i40e_rx_queue *rxq)
{
	int i;
	uint16_t rx_id;
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
	struct rte_mbuf *mb0, *mb1;
	uint64x2_t dma_addr0, dma_addr1;
	uint64x2_t zero = vdupq_n_u64(0);
	uint64_t paddr;

	rxdp = rxq->rx_ring + rxq->rxrearm_start;

	/* Pull 'n' more MBUFs into the software ring */
	if (unlikely(rte_mempool_get_bulk(rxq->mp,
					  (void *)rxep,
					  RTE_I40E_RXQ_REARM_THRESH) < 0)) {
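		/* Allocation failed: if the ring is about to run dry, park
		 * the next few entries on the queue's dummy mbuf and zero
		 * their descriptors so the NIC has no stale addresses to DMA
		 * into, then account the failure and retry on a later call.
		 */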
		if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
		    rxq->nb_rx_desc) {
			for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
				rxep[i].mbuf = &rxq->fake_mbuf;
				vst1q_u64((uint64_t *)&rxdp[i].read, zero);
			}
		}
		rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
			RTE_I40E_RXQ_REARM_THRESH;
		return;
	}

	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
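	/* Each read-format descriptor holds two addresses (packet buffer and
	 * header buffer); vdupq_n_u64() below writes the same physical
	 * address into both lanes, which is harmless since header split is
	 * not used on this path.
	 */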
	for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
		mb0 = rxep[0].mbuf;
		mb1 = rxep[1].mbuf;

		paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM;
		dma_addr0 = vdupq_n_u64(paddr);

		/* flush desc with pa dma_addr */
		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0);

		paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM;
		dma_addr1 = vdupq_n_u64(paddr);
		vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1);
	}

	rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH;
	if (rxq->rxrearm_start >= rxq->nb_rx_desc)
		rxq->rxrearm_start = 0;

	rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH;

	rx_id = (uint16_t)((rxq->rxrearm_start == 0) ?
			     (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1));

	rte_io_wmb();
	/* Update the tail pointer on the NIC */
	I40E_PCI_REG_WRITE_RELAXED(rxq->qrx_tail, rx_id);
}

static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4],
		  struct rte_mbuf **rx_pkts)
{
	uint32x4_t vlan0, vlan1, rss, l3_l4e;
	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
	uint64x2_t rearm0, rearm1, rearm2, rearm3;

	/* Mask everything except the RSS, flow director, VLAN and checksum
	 * status bits: bit 2 is the VLAN tag indication, bit 11 the flow
	 * director match, bits 13:12 the RSS indication and bits 24:22 the
	 * L3/L4/outer-IP checksum error status used further below.
	 */
	const uint32x4_t rss_vlan_msk = {
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

	const uint32x4_t cksum_mask = {
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD};

	/* map rss and vlan type to rss hash and vlan flag */
	const uint8x16_t vlan_flags = {
			0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t rss_flags = {
			0, PKT_RX_FDIR, 0, 0,
			0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t l3_l4e_flags = {
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			0, 0, 0, 0, 0, 0, 0, 0};
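	/* The checksum flags above are stored pre-shifted right by one so
	 * that each table entry fits in a byte for the vqtbl1q_u8() lookup;
	 * the result is shifted back left by one bit further down.
	 */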

	vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
			  vreinterpretq_u32_u64(descs[2])).val[1];
	vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
			  vreinterpretq_u32_u64(descs[3])).val[1];
	vlan0 = vzipq_u32(vlan0, vlan1).val[0];

	vlan1 = vandq_u32(vlan0, rss_vlan_msk);
	vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
						vreinterpretq_u8_u32(vlan1)));

	rss = vshrq_n_u32(vlan1, 11);
	rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
					      vreinterpretq_u8_u32(rss)));

	l3_l4e = vshrq_n_u32(vlan1, 22);
	l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
					      vreinterpretq_u8_u32(l3_l4e)));
	/* shift the flags back left by one bit */
	l3_l4e = vshlq_n_u32(l3_l4e, 1);
	/* mask out the redundant bits */
	l3_l4e = vandq_u32(l3_l4e, cksum_mask);

	vlan0 = vorrq_u32(vlan0, rss);
	vlan0 = vorrq_u32(vlan0, l3_l4e);

	rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
	rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
	rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
	rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

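	/* Each 16-byte store below writes mbuf_initializer into the 8-byte
	 * rearm region (data_off, refcnt, nb_segs, port) and the computed
	 * ol_flags into the adjacent 8 bytes; this relies on ol_flags sitting
	 * immediately after rearm_data in struct rte_mbuf.
	 */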
	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}

#define PKTLEN_SHIFT     10
#define I40E_UINT16_BIT (CHAR_BIT * sizeof(uint16_t))

static inline void
desc_to_ptype_v(uint64x2_t descs[4], struct rte_mbuf **__rte_restrict rx_pkts,
		uint32_t *__rte_restrict ptype_tbl)
{
	int i;
	uint8_t ptype;
	uint8x16_t tmp;

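	/* The 8-bit packet type lives in bits 37:30 of the descriptor's
	 * qword1; shifting both 64-bit lanes right by 30 leaves it in the
	 * low byte of the upper lane (byte 8), which indexes the adapter's
	 * ptype translation table.
	 */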
	for (i = 0; i < 4; i++) {
		tmp = vreinterpretq_u8_u64(vshrq_n_u64(descs[i], 30));
		ptype = vgetq_lane_u8(tmp, 8);
		rx_pkts[i]->packet_type = ptype_tbl[ptype];
	}
}

/**
 * vPMD raw receive routine; only accepts nb_pkts >= RTE_I40E_DESCS_PER_LOOP
 *
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are returned
 * - nb_pkts is floor-aligned to a multiple of RTE_I40E_DESCS_PER_LOOP
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *__rte_restrict rxq,
		   struct rte_mbuf **__rte_restrict rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,   /* pkt_type set as unknown */
		0xFF, 0xFF,   /* pkt_type set as unknown */
		14, 15,       /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
		14, 15,       /* octet 15~14, 16 bits data_len */
		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7    /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};
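	/* End-of-packet is bit 1 of each descriptor's 16-bit status word, so
	 * eop_check carries 0x02 in the position of each of the four packed
	 * status words; it is used in the split_packet path below.
	 */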

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0       /* ignore non-length fields */
		};

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet descriptors in one loop
	 * [A*. mask out 4 unused dirty fields in the descriptors]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 2 mbuf pointers */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 2 more mbuf pointers */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
		/* A.1 load the remaining descs */
		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] =  vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned */
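		/* The packet length occupies bits 51:38 of qword1. len_shl
		 * shifts only the top 32-bit lane (qword1's upper word) left
		 * by PKTLEN_SHIFT so the length lands in bytes 14-15, where
		 * shuf_msk picks it up as pkt_len/data_len.
		 */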
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned */
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * of DD bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 16-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
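		/* The shift-left/arithmetic-shift-right pair above broadcasts
		 * each descriptor's DD bit across its 16-bit lane; inverting
		 * the low 64 bits then lets the count of trailing zeros give
		 * the number of consecutive completed descriptors.
		 */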

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc available number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}

/*
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are returned
 * - if nb_pkts > RTE_I40E_VPMD_RX_BURST, only RTE_I40E_VPMD_RX_BURST
 *   DD bits are scanned
 */
uint16_t
i40e_recv_pkts_vec(void *__rte_restrict rx_queue,
		struct rte_mbuf **__rte_restrict rx_pkts, uint16_t nb_pkts)
{
	return _recv_raw_pkts_vec(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 *
 * Notice:
 * - if nb_pkts < RTE_I40E_DESCS_PER_LOOP, no packets are returned
 */
static uint16_t
i40e_recv_scattered_burst_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			      uint16_t nb_pkts)
{

	struct i40e_rx_queue *rxq = rx_queue;
	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _recv_raw_pkts_vec(rxq, rx_pkts, nb_pkts,
			split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case, full burst + no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (rxq->pkt_first_seg == NULL &&
			split_fl64[0] == 0 && split_fl64[1] == 0 &&
			split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (rxq->pkt_first_seg == NULL) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
		&split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 */
uint16_t
i40e_recv_scattered_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
			     uint16_t nb_pkts)
{
	uint16_t retval = 0;

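	/* Hand the request to the burst routine in chunks of at most
	 * RTE_I40E_VPMD_RX_BURST packets, the size its split_flags array
	 * can track.
	 */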
	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
		uint16_t burst;

		burst = i40e_recv_scattered_burst_vec(rx_queue,
						      rx_pkts + retval,
						      RTE_I40E_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < RTE_I40E_VPMD_RX_BURST)
			return retval;
	}

	return retval + i40e_recv_scattered_burst_vec(rx_queue,
						      rx_pkts + retval,
						      nb_pkts);
}

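/* vtx1() writes one 16-byte data descriptor: the low qword carries the DMA
 * address of the packet data, the high qword packs the descriptor type, the
 * command flags and the buffer length at their QW1 offsets.
 */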
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
		struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << I40E_TXD_QW1_CMD_SHIFT) |
			((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));

	uint64x2_t descriptor = {pkt->buf_iova + pkt->data_off, high_qw};
	vst1q_u64((uint64_t *)txdp, descriptor);
}

static inline void
vtx(volatile struct i40e_tx_desc *txdp, struct rte_mbuf **pkt,
		uint16_t nb_pkts, uint64_t flags)
{
	int i;

	for (i = 0; i < nb_pkts; ++i, ++txdp, ++pkt)
		vtx1(txdp, *pkt, flags);
}

uint16_t
i40e_xmit_fixed_burst_vec(void *__rte_restrict tx_queue,
	struct rte_mbuf **__rte_restrict tx_pkts, uint16_t nb_pkts)
{
	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
	volatile struct i40e_tx_desc *txdp;
	struct i40e_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = I40E_TD_CMD;
	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;
	int i;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		i40e_tx_free_bufs(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		tx_backlog_entry(txep, tx_pkts, n);

		for (i = 0; i < n - 1; ++i, ++tx_pkts, ++txdp)
			vtx1(txdp, *tx_pkts, flags);

		vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring[tx_id];
	}
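	/* Whatever is left of the burst (possibly all of it, if the ring did
	 * not wrap) is queued from the current position; its RS bit is set
	 * via the tx_next_rs bookkeeping below rather than per descriptor.
	 */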

	tx_backlog_entry(txep, tx_pkts, nb_commit);

	vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
	if (tx_id > txq->tx_next_rs) {
		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
						I40E_TXD_QW1_CMD_SHIFT);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	rte_io_wmb();
	I40E_PCI_REG_WRITE_RELAXED(txq->qtx_tail, tx_id);

	return nb_pkts;
}

void __rte_cold
i40e_rx_queue_release_mbufs_vec(struct i40e_rx_queue *rxq)
{
	_i40e_rx_queue_release_mbufs_vec(rxq);
}

int __rte_cold
i40e_rxq_vec_setup(struct i40e_rx_queue *rxq)
{
	return i40e_rxq_vec_setup_default(rxq);
}

int __rte_cold
i40e_txq_vec_setup(struct i40e_tx_queue __rte_unused *txq)
{
	return 0;
}

int __rte_cold
i40e_rx_vec_dev_conf_condition_check(struct rte_eth_dev *dev)
{
	return i40e_rx_vec_dev_conf_condition_check_default(dev);
}