dpdk/drivers/net/hns3/hns3_rxtx_vec_neon.h
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020-2021 HiSilicon Limited.
 */

#ifndef _HNS3_RXTX_VEC_NEON_H_
#define _HNS3_RXTX_VEC_NEON_H_

#include <arm_neon.h>

#pragma GCC diagnostic ignored "-Wcast-qual"

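/*
 * Fill one Tx BD with two 128-bit NEON stores: the first writes the buffer
 * IOVA and the data length shifted into the send-size field, the second
 * clears the outer VLAN words and sets the default VLD/FE/BD-type flags in
 * the upper 32 bits.
 */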
static inline void
hns3_vec_tx(volatile struct hns3_desc *desc, struct rte_mbuf *pkt)
{
        uint64x2_t val1 = {
                pkt->buf_iova + pkt->data_off,
                ((uint64_t)pkt->data_len) << HNS3_TXD_SEND_SIZE_SHIFT
        };
        uint64x2_t val2 = {
                0,
                ((uint64_t)HNS3_TXD_DEFAULT_VLD_FE_BDTYPE) << HNS3_UINT32_BIT
        };
        vst1q_u64((uint64_t *)&desc->addr, val1);
        vst1q_u64((uint64_t *)&desc->tx.outer_vlan_tag, val2);
}

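/*
 * Vector Tx burst: reclaim used BDs when running low, commit at most
 * tx_bd_ready packets, fill descriptors in two passes when the burst wraps
 * past the end of the ring, then ring the doorbell once for the whole batch.
 */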
static uint16_t
hns3_xmit_fixed_burst_vec(void *__restrict tx_queue,
                          struct rte_mbuf **__restrict tx_pkts,
                          uint16_t nb_pkts)
{
        struct hns3_tx_queue *txq = (struct hns3_tx_queue *)tx_queue;
        volatile struct hns3_desc *tx_desc;
        struct hns3_entry *tx_entry;
        uint16_t next_to_use;
        uint16_t nb_commit;
        uint16_t nb_tx;
        uint16_t n, i;

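        /* reclaim transmitted BDs (and free their mbufs) when running low */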
        if (txq->tx_bd_ready < txq->tx_free_thresh)
                hns3_tx_free_buffers(txq);

        nb_commit = RTE_MIN(txq->tx_bd_ready, nb_pkts);
        if (unlikely(nb_commit == 0)) {
                txq->dfx_stats.queue_full_cnt++;
                return 0;
        }
        nb_tx = nb_commit;

        next_to_use = txq->next_to_use;
        tx_desc = &txq->tx_ring[next_to_use];
        tx_entry = &txq->sw_ring[next_to_use];

        /*
         * If nb_commit crosses the end of the ring, i.e. it exceeds the
         * number of descriptors left between next_to_use and txq->nb_tx_desc,
         * fill those n trailing descriptors first and then wrap around to the
         * start of sw_ring and tx_ring. Splitting the work this way avoids a
         * wrap check inside the loop.
         */
        n = txq->nb_tx_desc - next_to_use;
        if (nb_commit >= n) {
                for (i = 0; i < n; i++, tx_pkts++, tx_desc++) {
                        hns3_vec_tx(tx_desc, *tx_pkts);
                        tx_entry[i].mbuf = *tx_pkts;

                        /* Increment bytes counter */
                        txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
                }

                nb_commit -= n;
                next_to_use = 0;
                tx_desc = &txq->tx_ring[next_to_use];
                tx_entry = &txq->sw_ring[next_to_use];
        }

        for (i = 0; i < nb_commit; i++, tx_pkts++, tx_desc++) {
                hns3_vec_tx(tx_desc, *tx_pkts);
                tx_entry[i].mbuf = *tx_pkts;

                /* Increment bytes counter */
                txq->basic_stats.bytes += (*tx_pkts)->pkt_len;
        }

        next_to_use += nb_commit;
        txq->next_to_use = next_to_use;
        txq->tx_bd_ready -= nb_tx;

        hns3_write_txq_tail_reg(txq, nb_tx);

        return nb_tx;
}

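/*
 * Post-process up to four received packets: set the initial ol_flags, let
 * hns3_handle_bdinfo() validate each BD, and resolve the packet type. The
 * return value is a bitmask with bit i set when descriptor i failed parsing.
 */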
static inline uint32_t
hns3_desc_parse_field(struct hns3_rx_queue *rxq,
                      struct hns3_entry *sw_ring,
                      struct hns3_desc *rxdp,
                      uint32_t bd_vld_num)
{
        uint32_t l234_info, ol_info, bd_base_info;
        struct rte_mbuf *pkt;
        uint32_t retcode = 0;
        uint32_t i;
        int ret;

        for (i = 0; i < bd_vld_num; i++) {
                pkt = sw_ring[i].mbuf;

                /* init the last 64 bits of rte_mbuf.rearm_data */
                pkt->ol_flags = PKT_RX_RSS_HASH;

                l234_info = rxdp[i].rx.l234_info;
                ol_info = rxdp[i].rx.ol_info;
                bd_base_info = rxdp[i].rx.bd_base_info;
                ret = hns3_handle_bdinfo(rxq, pkt, bd_base_info, l234_info);
                if (unlikely(ret)) {
                        retcode |= 1u << i;
                        continue;
                }

                pkt->packet_type = hns3_rx_calc_ptype(rxq, l234_info, ol_info);

                /* Increment bytes counter */
                rxq->basic_stats.bytes += pkt->pkt_len;
        }

        return retcode;
}

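/*
 * Vector Rx burst: handle four BDs per iteration. Count the leading
 * contiguous valid BDs, publish the mbuf pointers to rx_pkts, shuffle the
 * descriptor fields straight into rx_descriptor_fields1, adjust the lengths
 * for a possible CRC, and report per-BD parse errors through *bd_err_mask.
 */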
static inline uint16_t
hns3_recv_burst_vec(struct hns3_rx_queue *__restrict rxq,
                    struct rte_mbuf **__restrict rx_pkts,
                    uint16_t nb_pkts,
                    uint64_t *bd_err_mask)
{
        uint16_t rx_id = rxq->next_to_use;
        struct hns3_entry *sw_ring = &rxq->sw_ring[rx_id];
        struct hns3_desc *rxdp = &rxq->rx_ring[rx_id];
        uint32_t bd_valid_num, parse_retcode;
        uint16_t nb_rx = 0;
        uint32_t pos;
        int offset;

        /* mask to shuffle from desc to mbuf's rx_descriptor_fields1 */
        uint8x16_t shuf_desc_fields_msk = {
                0xff, 0xff, 0xff, 0xff,  /* packet type init zero */
                22, 23, 0xff, 0xff,      /* rx.pkt_len to rte_mbuf.pkt_len */
                20, 21,                  /* size to rte_mbuf.data_len */
                0xff, 0xff,              /* rte_mbuf.vlan_tci init zero */
                8, 9, 10, 11,            /* rx.rss_hash to rte_mbuf.hash.rss */
        };

        uint16x8_t crc_adjust = {
                0, 0,         /* ignore pkt_type field */
                rxq->crc_len, /* sub crc on pkt_len */
                0,            /* ignore high-16bits of pkt_len */
                rxq->crc_len, /* sub crc on data_len */
                0, 0, 0,      /* ignore non-length fields */
        };
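        /*
         * rxq->crc_len is expected to be nonzero only when the application
         * keeps the Ethernet CRC, so the subtractions below are a no-op when
         * the hardware strips it.
         */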

        /* compile-time check that the shuffle mask matches the mbuf layout */
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
        RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash.rss) !=
                         offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

        for (pos = 0; pos < nb_pkts; pos += HNS3_DEFAULT_DESCS_PER_LOOP,
                                     rxdp += HNS3_DEFAULT_DESCS_PER_LOOP) {
                uint64x2x2_t descs[HNS3_DEFAULT_DESCS_PER_LOOP];
                uint8x16x2_t pkt_mbuf1, pkt_mbuf2, pkt_mbuf3, pkt_mbuf4;
                uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
                uint64x2_t mbp1, mbp2;
                uint16x4_t bd_vld = {0};
                uint16x8_t tmp;
                uint64_t stat;

                /* calculate how many BDs are valid */
                bd_vld = vset_lane_u16(rxdp[0].rx.bdtype_vld_udp0, bd_vld, 0);
                bd_vld = vset_lane_u16(rxdp[1].rx.bdtype_vld_udp0, bd_vld, 1);
                bd_vld = vset_lane_u16(rxdp[2].rx.bdtype_vld_udp0, bd_vld, 2);
                bd_vld = vset_lane_u16(rxdp[3].rx.bdtype_vld_udp0, bd_vld, 3);

                /* load the first 2 mbuf pointers */
                mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

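                /*
                 * Shift each BD's VLD bit into the sign position, then
                 * arithmetic-shift right so every 16-bit lane becomes all
                 * ones (valid) or all zeros (invalid). Inverting the 64-bit
                 * view lets __builtin_ctzl() count the leading contiguous
                 * valid BDs below.
                 */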
                bd_vld = vshl_n_u16(bd_vld,
                                    HNS3_UINT16_BIT - 1 - HNS3_RXD_VLD_B);
                bd_vld = vreinterpret_u16_s16(
                                vshr_n_s16(vreinterpret_s16_u16(bd_vld),
                                           HNS3_UINT16_BIT - 1));
                stat = ~vget_lane_u64(vreinterpret_u64_u16(bd_vld), 0);

                /* load the next 2 mbuf pointers */
                mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

                if (likely(stat == 0))
                        bd_valid_num = HNS3_DEFAULT_DESCS_PER_LOOP;
                else
                        bd_valid_num = __builtin_ctzl(stat) / HNS3_UINT16_BIT;
                if (bd_valid_num == 0)
                        break;

                /* use offset to control the order of the desc loads below */
                offset = rxq->offset_table[bd_valid_num];

                /* store 2 mbuf pointers into rx_pkts */
                vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

                /* read the first two descs */
                descs[0] = vld2q_u64((uint64_t *)(rxdp + offset));
                descs[1] = vld2q_u64((uint64_t *)(rxdp + offset + 1));

                /* store 2 more mbuf pointers into rx_pkts */
                vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

                /* read the remaining two descs */
                descs[2] = vld2q_u64((uint64_t *)(rxdp + offset + 2));
                descs[3] = vld2q_u64((uint64_t *)(rxdp + offset + 3));

                pkt_mbuf1.val[0] = vreinterpretq_u8_u64(descs[0].val[0]);
                pkt_mbuf1.val[1] = vreinterpretq_u8_u64(descs[0].val[1]);
                pkt_mbuf2.val[0] = vreinterpretq_u8_u64(descs[1].val[0]);
                pkt_mbuf2.val[1] = vreinterpretq_u8_u64(descs[1].val[1]);

                /* pkt 1,2: convert descriptor format to pktmbuf fields */
                pkt_mb1 = vqtbl2q_u8(pkt_mbuf1, shuf_desc_fields_msk);
                pkt_mb2 = vqtbl2q_u8(pkt_mbuf2, shuf_desc_fields_msk);

                /* store the first 8 bytes of pkt 1,2 mbuf's rearm_data */
                *(uint64_t *)&sw_ring[pos + 0].mbuf->rearm_data =
                        rxq->mbuf_initializer;
                *(uint64_t *)&sw_ring[pos + 1].mbuf->rearm_data =
                        rxq->mbuf_initializer;

                /* pkt 1,2: remove CRC from pkt_len and data_len */
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
                pkt_mb1 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
                pkt_mb2 = vreinterpretq_u8_u16(tmp);

                pkt_mbuf3.val[0] = vreinterpretq_u8_u64(descs[2].val[0]);
                pkt_mbuf3.val[1] = vreinterpretq_u8_u64(descs[2].val[1]);
                pkt_mbuf4.val[0] = vreinterpretq_u8_u64(descs[3].val[0]);
                pkt_mbuf4.val[1] = vreinterpretq_u8_u64(descs[3].val[1]);

                /* pkt 3,4: convert descriptor format to pktmbuf fields */
                pkt_mb3 = vqtbl2q_u8(pkt_mbuf3, shuf_desc_fields_msk);
                pkt_mb4 = vqtbl2q_u8(pkt_mbuf4, shuf_desc_fields_msk);

                /* pkt 1,2: save to the rx_pkts mbufs */
                vst1q_u8((void *)&sw_ring[pos + 0].mbuf->rx_descriptor_fields1,
                         pkt_mb1);
                vst1q_u8((void *)&sw_ring[pos + 1].mbuf->rx_descriptor_fields1,
                         pkt_mb2);

                /* pkt 3,4: remove CRC from pkt_len and data_len */
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
                pkt_mb3 = vreinterpretq_u8_u16(tmp);
                tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
                pkt_mb4 = vreinterpretq_u8_u16(tmp);

                /* store the first 8 bytes of pkt 3,4 mbuf's rearm_data */
                *(uint64_t *)&sw_ring[pos + 2].mbuf->rearm_data =
                        rxq->mbuf_initializer;
                *(uint64_t *)&sw_ring[pos + 3].mbuf->rearm_data =
                        rxq->mbuf_initializer;

                /* pkt 3,4: save to the rx_pkts mbufs */
                vst1q_u8((void *)&sw_ring[pos + 2].mbuf->rx_descriptor_fields1,
                         pkt_mb3);
                vst1q_u8((void *)&sw_ring[pos + 3].mbuf->rx_descriptor_fields1,
                         pkt_mb4);

                rte_prefetch_non_temporal(rxdp + HNS3_DEFAULT_DESCS_PER_LOOP);

                parse_retcode = hns3_desc_parse_field(rxq, &sw_ring[pos],
                        &rxdp[offset], bd_valid_num);
                if (unlikely(parse_retcode))
                        (*bd_err_mask) |= ((uint64_t)parse_retcode) << pos;

                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 0].mbuf);
                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 1].mbuf);
                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 2].mbuf);
                rte_prefetch0(sw_ring[pos +
                                      HNS3_DEFAULT_DESCS_PER_LOOP + 3].mbuf);

                nb_rx += bd_valid_num;
                if (bd_valid_num < HNS3_DEFAULT_DESCS_PER_LOOP)
                        break;
        }

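        /* advance the consumer index and record BDs that now need rearming */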
        rxq->rx_rearm_nb += nb_rx;
        rxq->next_to_use += nb_rx;
        if (rxq->next_to_use >= rxq->nb_rx_desc)
                rxq->next_to_use = 0;

        return nb_rx;
}
#endif /* _HNS3_RXTX_VEC_NEON_H_ */