/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <stdint.h>
#include <ethdev_driver.h>
#include <rte_malloc.h>

#include "base/i40e_prototype.h"
#include "base/i40e_type.h"
#include "i40e_ethdev.h"
#include "i40e_rxtx.h"
#include "i40e_rxtx_vec_common.h"
#include "i40e_rxtx_common_avx.h" /* for i40e_rxq_rearm_common() */

#include <rte_vect.h>

#ifndef __INTEL_COMPILER
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif

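/* Wrapper around the rearm helper shared with the other i40e vector RX
 * paths; the boolean selects the AVX-512 variant of the helper, so the
 * AVX2 path passes false.
 */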
static __rte_always_inline void
i40e_rxq_rearm(struct i40e_rx_queue *rxq)
{
	return i40e_rxq_rearm_common(rxq, false);
}

#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
/* Handles 32B descriptor FDIR ID processing:
 * rxdp: receive descriptor ring, required to load 2nd 16B half of each desc
 * rx_pkts: required to store metadata back to mbufs
 * pkt_idx: offset into the burst, increments in vector widths
 * desc_idx: required to select the correct shift at compile time
 */
static inline __m256i
desc_fdir_processing_32b(volatile union i40e_rx_desc *rxdp,
			 struct rte_mbuf **rx_pkts,
			 const uint32_t pkt_idx,
			 const uint32_t desc_idx)
{
	/* 32B desc path: load rxdp.wb.qword2 for EXT_STATUS and FLEXBH_STAT */
	__m128i *rxdp_desc_0 = (void *)(&rxdp[desc_idx + 0].wb.qword2);
	__m128i *rxdp_desc_1 = (void *)(&rxdp[desc_idx + 1].wb.qword2);
	const __m128i desc_qw2_0 = _mm_load_si128(rxdp_desc_0);
	const __m128i desc_qw2_1 = _mm_load_si128(rxdp_desc_1);

	/* Mask for FLEXBH_STAT, and the FLEXBH_STAT == FD_ID case.
	 * Remaining lanes are set to all 1's to pass the data through.
	 */
	const __m256i flexbh_mask = _mm256_set_epi32(-1, -1, -1, 3 << 4,
						     -1, -1, -1, 3 << 4);
	const __m256i flexbh_id = _mm256_set_epi32(-1, -1, -1, 1 << 4,
						   -1, -1, -1, 1 << 4);

	/* Load both descriptors into one register, check for the FLEXBH
	 * bits, and generate an FDIR-valid mask for both packets.
	 */
	__m256i desc_qw2_0_1 =
		_mm256_inserti128_si256(_mm256_castsi128_si256(desc_qw2_0),
					desc_qw2_1, 1);
	__m256i desc_tmp_msk = _mm256_and_si256(flexbh_mask, desc_qw2_0_1);
	__m256i fdir_mask = _mm256_cmpeq_epi32(flexbh_id, desc_tmp_msk);
	__m256i fdir_data = _mm256_alignr_epi8(desc_qw2_0_1, desc_qw2_0_1, 12);
	__m256i desc_fdir_data = _mm256_and_si256(fdir_mask, fdir_data);

	/* Write the FDIR ID out to the mbufs. There is no other store to
	 * this area of the mbuf today, so it cannot be combined with
	 * another store.
	 */
	const uint32_t idx_0 = pkt_idx + desc_idx;
	const uint32_t idx_1 = pkt_idx + desc_idx + 1;
	rx_pkts[idx_0]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 0);
	rx_pkts[idx_1]->hash.fdir.hi = _mm256_extract_epi32(desc_fdir_data, 4);

	/* Build the mbuf flags: where the descriptor reported a valid FDIR
	 * ID the fdir_mask lane is all-ones, so ANDing it with the flag
	 * constant yields PKT_RX_FDIR_ID per packet. The flag must sit at
	 * bit 13 of ol_flags; the build-time check below guards that
	 * assumption. Only the low lane of each 128-bit half carries the
	 * flag at this point; the switch at the end rotates it into the
	 * 32-bit lane matching the descriptor pair being processed, so the
	 * caller can simply OR the result into mbuf_flags.
	 */
	RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
	__m256i mbuf_flag_mask = _mm256_set_epi32(0, 0, 0, 1 << 13,
						  0, 0, 0, 1 << 13);
	__m256i desc_flag_bit = _mm256_and_si256(mbuf_flag_mask, fdir_mask);

	/* Based on desc_idx, rotate the flag bit to its position in the
	 * 256-bit register holding the flags for all 8 packets of the loop.
	 */
	switch (desc_idx) {
	case 0:
		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 4);
	case 2:
		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 8);
	case 4:
		return _mm256_alignr_epi8(desc_flag_bit, desc_flag_bit, 12);
	case 6:
		return desc_flag_bit;
	default:
		break;
	}

	/* NOT REACHED: all valid desc_idx values return above */
	return _mm256_setzero_si256();
}
#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */

#define PKTLEN_SHIFT 10

/* Common RX body shared by the normal and scattered paths, always inlined;
 * it processes RTE_I40E_DESCS_PER_LOOP_AVX (8) descriptors per iteration.
 */
static __rte_always_inline uint16_t
_recv_raw_pkts_vec_avx2(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
#define RTE_I40E_DESCS_PER_LOOP_AVX 8

	const uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;
	const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
			0, rxq->mbuf_initializer);
	struct i40e_rx_entry *sw_ring = &rxq->sw_ring[rxq->rx_tail];
	volatile union i40e_rx_desc *rxdp = rxq->rx_ring + rxq->rx_tail;
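	/* a pair of 16B descriptors can be fetched with a single aligned 32B
	 * load only when rx_tail is even; the flag is consumed by the 16B
	 * descriptor load path below and is unused otherwise
	 */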
	const int avx_aligned = ((rxq->rx_tail & 1) == 0);
	rte_prefetch0(rxdp);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP_AVX */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP_AVX);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* constants used in the processing loop */
	const __m256i crc_adjust = _mm256_set_epi16(
			/* first descriptor */
			0, 0, 0,       /* ignore non-length fields */
			-rxq->crc_len, /* sub crc on data_len */
			0,             /* ignore high-16bits of pkt_len */
			-rxq->crc_len, /* sub crc on pkt_len */
			0, 0,          /* ignore pkt_type field */
			/* second descriptor */
			0, 0, 0,       /* ignore non-length fields */
			-rxq->crc_len, /* sub crc on data_len */
			0,             /* ignore high-16bits of pkt_len */
			-rxq->crc_len, /* sub crc on pkt_len */
			0, 0           /* ignore pkt_type field */
	);

	/* 8 packets DD mask, LSB in each 32-bit value */
	const __m256i dd_check = _mm256_set1_epi32(1);

	/* 8 packets EOP mask, second-LSB in each 32-bit value */
	const __m256i eop_check = _mm256_slli_epi32(dd_check,
			I40E_RX_DESC_STATUS_EOF_SHIFT);

	/* mask to shuffle from descriptor to mbuf (2 descriptors per 256b) */
	const __m256i shuf_msk = _mm256_set_epi8(
			/* first descriptor */
			7, 6, 5, 4,  /* octet 4~7, 32bits rss */
			3, 2,        /* octet 2~3, low 16 bits vlan_macip */
			15, 14,      /* octet 15~14, 16 bits data_len */
			0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			15, 14,      /* octet 15~14, low 16 bits pkt_len */
			0xFF, 0xFF,  /* pkt_type set as unknown */
			0xFF, 0xFF,  /* pkt_type set as unknown */
			/* second descriptor */
			7, 6, 5, 4,  /* octet 4~7, 32bits rss */
			3, 2,        /* octet 2~3, low 16 bits vlan_macip */
			15, 14,      /* octet 15~14, 16 bits data_len */
			0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
			15, 14,      /* octet 15~14, low 16 bits pkt_len */
			0xFF, 0xFF,  /* pkt_type set as unknown */
			0xFF, 0xFF   /* pkt_type set as unknown */
	);

	/* compile-time check that the above crc_adjust and shuf_msk layouts
	 * are correct. NOTE: the first field (lowest address) is given last
	 * in the set_epi* calls above.
	 */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, pkt_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 4);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_len) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, vlan_tci) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 10);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, hash) !=
			offsetof(struct rte_mbuf, rx_descriptor_fields1) + 12);

	/* Status/Error flag masks */
	/* mask everything except RSS, flow director and VLAN flags:
	 * bit2 is for VLAN tag, bit11 for flow director indication,
	 * bit13:12 for RSS indication, bit24:22 for checksum
	 */
	const __m256i flags_mask = _mm256_set1_epi32(
			(1 << 2) | (1 << 11) | (3 << 12) | (7 << 22));

	/* data to be shuffled by result of flag mask. If VLAN bit is set
	 * (bit 2), then position 4 in this array will be used in the
	 * destination
	 */
	const __m256i vlan_flags_shuf = _mm256_set_epi32(
			0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0,
			0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0);

	/* data to be shuffled by result of flag mask, shifted down 11.
	 * If RSS(bit12)/FDIR(bit11) are set,
	 * shuffle moves appropriate flags in place.
	 */
	const __m256i rss_flags_shuf = _mm256_set_epi8(
			0, 0, 0, 0, 0, 0, 0, 0,
			PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
			0, 0, PKT_RX_FDIR, 0,
			/* second 128-bits */
			0, 0, 0, 0, 0, 0, 0, 0,
			PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
			0, 0, PKT_RX_FDIR, 0);

	/* data to be shuffled by the result of the flags mask shifted by 22
	 * bits. This gives us the l3_l4 flags.
	 */
	const __m256i l3_l4_flags_shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it does not exceed 255 */
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_GOOD) >> 1,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
			 PKT_RX_IP_CKSUM_GOOD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1,
			/* second 128-bits */
			0, 0, 0, 0, 0, 0, 0, 0,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_GOOD) >> 1,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_OUTER_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD |
			 PKT_RX_IP_CKSUM_GOOD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_GOOD) >> 1,
			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_GOOD | PKT_RX_IP_CKSUM_GOOD) >> 1);

	const __m256i cksum_mask = _mm256_set1_epi32(
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_OUTER_IP_CKSUM_BAD);

	RTE_SET_USED(avx_aligned); /* only used with 16B descriptors */

	uint16_t i, received;
	for (i = 0, received = 0; i < nb_pkts;
			i += RTE_I40E_DESCS_PER_LOOP_AVX,
			rxdp += RTE_I40E_DESCS_PER_LOOP_AVX) {
		/* step 1, copy over 8 mbuf pointers to rx_pkts array */
		_mm256_storeu_si256((void *)&rx_pkts[i],
				_mm256_loadu_si256((void *)&sw_ring[i]));
#ifdef RTE_ARCH_X86_64
		/* with 64-bit pointers, 8 mbufs need a second 256-bit store */
		_mm256_storeu_si256((void *)&rx_pkts[i + 4],
				_mm256_loadu_si256((void *)&sw_ring[i + 4]));
#endif

		__m256i raw_desc0_1, raw_desc2_3, raw_desc4_5, raw_desc6_7;
#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
		/* for AVX we need alignment otherwise loads are not atomic */
		if (avx_aligned) {
			/* load in descriptors, 2 at a time, in reverse order */
			raw_desc6_7 = _mm256_load_si256((void *)(rxdp + 6));
			rte_compiler_barrier();
			raw_desc4_5 = _mm256_load_si256((void *)(rxdp + 4));
			rte_compiler_barrier();
			raw_desc2_3 = _mm256_load_si256((void *)(rxdp + 2));
			rte_compiler_barrier();
			raw_desc0_1 = _mm256_load_si256((void *)(rxdp + 0));
		} else
#endif
		do {
			const __m128i raw_desc7 = _mm_load_si128((void *)(rxdp + 7));
			rte_compiler_barrier();
			const __m128i raw_desc6 = _mm_load_si128((void *)(rxdp + 6));
			rte_compiler_barrier();
			const __m128i raw_desc5 = _mm_load_si128((void *)(rxdp + 5));
			rte_compiler_barrier();
			const __m128i raw_desc4 = _mm_load_si128((void *)(rxdp + 4));
			rte_compiler_barrier();
			const __m128i raw_desc3 = _mm_load_si128((void *)(rxdp + 3));
			rte_compiler_barrier();
			const __m128i raw_desc2 = _mm_load_si128((void *)(rxdp + 2));
			rte_compiler_barrier();
			const __m128i raw_desc1 = _mm_load_si128((void *)(rxdp + 1));
			rte_compiler_barrier();
			const __m128i raw_desc0 = _mm_load_si128((void *)(rxdp + 0));

			raw_desc6_7 = _mm256_inserti128_si256(
					_mm256_castsi128_si256(raw_desc6), raw_desc7, 1);
			raw_desc4_5 = _mm256_inserti128_si256(
					_mm256_castsi128_si256(raw_desc4), raw_desc5, 1);
			raw_desc2_3 = _mm256_inserti128_si256(
					_mm256_castsi128_si256(raw_desc2), raw_desc3, 1);
			raw_desc0_1 = _mm256_inserti128_si256(
					_mm256_castsi128_si256(raw_desc0), raw_desc1, 1);
		} while (0);

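		/* prefetch the second cacheline of each mbuf up front, since
		 * the split-packet reassembly path will touch it
		 */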
		if (split_packet) {
			int j;
			for (j = 0; j < RTE_I40E_DESCS_PER_LOOP_AVX; j++)
				rte_mbuf_prefetch_part2(rx_pkts[i + j]);
		}

		/* convert descriptors 4-7 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m256i len6_7 = _mm256_slli_epi32(raw_desc6_7, PKTLEN_SHIFT);
		const __m256i len4_5 = _mm256_slli_epi32(raw_desc4_5, PKTLEN_SHIFT);
		const __m256i desc6_7 = _mm256_blend_epi16(raw_desc6_7, len6_7, 0x80);
		const __m256i desc4_5 = _mm256_blend_epi16(raw_desc4_5, len4_5, 0x80);
		__m256i mb6_7 = _mm256_shuffle_epi8(desc6_7, shuf_msk);
		__m256i mb4_5 = _mm256_shuffle_epi8(desc4_5, shuf_msk);
		mb6_7 = _mm256_add_epi16(mb6_7, crc_adjust);
		mb4_5 = _mm256_add_epi16(mb4_5, crc_adjust);

		/* to get packet types, shift 64-bit values down 30 bits,
		 * so the ptype is in the lower 8 bits of each lane
		 */
		const __m256i ptypes6_7 = _mm256_srli_epi64(desc6_7, 30);
		const __m256i ptypes4_5 = _mm256_srli_epi64(desc4_5, 30);
		const uint8_t ptype7 = _mm256_extract_epi8(ptypes6_7, 24);
		const uint8_t ptype6 = _mm256_extract_epi8(ptypes6_7, 8);
		const uint8_t ptype5 = _mm256_extract_epi8(ptypes4_5, 24);
		const uint8_t ptype4 = _mm256_extract_epi8(ptypes4_5, 8);
		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype7], 4);
		mb6_7 = _mm256_insert_epi32(mb6_7, ptype_tbl[ptype6], 0);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype5], 4);
		mb4_5 = _mm256_insert_epi32(mb4_5, ptype_tbl[ptype4], 0);

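		/* gather the 32-bit status/error words of descriptors 4-7
		 * into one register; combined with the matching gather for
		 * descriptors 0-3 by the unpacklo_epi64 further down
		 */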
		const __m256i status4_7 = _mm256_unpackhi_epi32(desc6_7,
				desc4_5);

		/* convert descriptors 0-3 into mbufs, adjusting length and
		 * re-arranging fields. Then write into the mbuf
		 */
		const __m256i len2_3 = _mm256_slli_epi32(raw_desc2_3, PKTLEN_SHIFT);
		const __m256i len0_1 = _mm256_slli_epi32(raw_desc0_1, PKTLEN_SHIFT);
		const __m256i desc2_3 = _mm256_blend_epi16(raw_desc2_3, len2_3, 0x80);
		const __m256i desc0_1 = _mm256_blend_epi16(raw_desc0_1, len0_1, 0x80);
		__m256i mb2_3 = _mm256_shuffle_epi8(desc2_3, shuf_msk);
		__m256i mb0_1 = _mm256_shuffle_epi8(desc0_1, shuf_msk);
		mb2_3 = _mm256_add_epi16(mb2_3, crc_adjust);
		mb0_1 = _mm256_add_epi16(mb0_1, crc_adjust);

		const __m256i ptypes2_3 = _mm256_srli_epi64(desc2_3, 30);
		const __m256i ptypes0_1 = _mm256_srli_epi64(desc0_1, 30);
		const uint8_t ptype3 = _mm256_extract_epi8(ptypes2_3, 24);
		const uint8_t ptype2 = _mm256_extract_epi8(ptypes2_3, 8);
		const uint8_t ptype1 = _mm256_extract_epi8(ptypes0_1, 24);
		const uint8_t ptype0 = _mm256_extract_epi8(ptypes0_1, 8);
		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype3], 4);
		mb2_3 = _mm256_insert_epi32(mb2_3, ptype_tbl[ptype2], 0);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype1], 4);
		mb0_1 = _mm256_insert_epi32(mb0_1, ptype_tbl[ptype0], 0);

		const __m256i status0_3 = _mm256_unpackhi_epi32(desc2_3,
				desc0_1);

		/* combine the two gathers: the status bits for all 8
		 * descriptors now sit together in one register
		 */
		__m256i status0_7 = _mm256_unpacklo_epi64(status4_7,
				status0_3);

		/* now do flag manipulation */

		/* get only the flag/error bits we want */
		const __m256i flag_bits = _mm256_and_si256(
				status0_7, flags_mask);
		/* set vlan and rss flags */
		const __m256i vlan_flags = _mm256_shuffle_epi8(
				vlan_flags_shuf, flag_bits);
		const __m256i rss_fdir_bits = _mm256_srli_epi32(flag_bits, 11);
		const __m256i rss_flags = _mm256_shuffle_epi8(rss_flags_shuf,
				rss_fdir_bits);

		/* l3_l4_error flags: shuffle, then shift to the correct bit
		 * positions for the checksum flags (the shuffle table stores
		 * them pre-shifted right by one so each fits in a byte)
		 */
		__m256i l3_l4_flags = _mm256_shuffle_epi8(l3_l4_flags_shuf,
				_mm256_srli_epi32(flag_bits, 22));
		l3_l4_flags = _mm256_slli_epi32(l3_l4_flags, 1);
		l3_l4_flags = _mm256_and_si256(l3_l4_flags, cksum_mask);

		/* merge the flags */
		__m256i mbuf_flags = _mm256_or_si256(l3_l4_flags,
				_mm256_or_si256(rss_flags, vlan_flags));

		/* If the rxq has FDIR enabled, read FDIR info from the
		 * descriptor to update the mbuf flags and FDIR ID.
		 */
		if (rxq->fdir_enabled) {
#ifdef RTE_LIBRTE_I40E_16BYTE_RX_DESC
			/* 16B descriptor code path:
			 * RSS and FDIR ID use the same offset in the desc, so
			 * only one can be present at a time. The code below
			 * identifies an FDIR ID match, and zeros the RSS value
			 * in the mbuf on FDIR match to keep mbuf data clean.
			 */
#define FDIR_BLEND_MASK ((1 << 3) | (1 << 7))

			/* Flags:
			 * - Take the flags, shift bits to null out
			 * - CMPEQ with known FDIR ID, to get 0xFFFF or 0 mask
			 * - Strip bits from mask, leaving 0 or 1 for FDIR ID
			 * - Merge with mbuf_flags
			 */
			/* FLM = 1, FLTSTAT = 0b01, (FLM | FLTSTAT) == 3.
			 * Shift left by 28 to avoid having to mask.
			 */
			const __m256i fdir = _mm256_slli_epi32(rss_fdir_bits, 28);
			const __m256i fdir_id = _mm256_set1_epi32(3 << 28);

			/* The FDIR ID flag sits at bit 13 of ol_flags;
			 * compile-time-check that assumption, then AND the
			 * compare mask with the flag value to build the
			 * per-packet PKT_RX_FDIR_ID flags and merge them in.
			 */
			RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
			const __m256i pkt_fdir_bit = _mm256_set1_epi32(1 << 13);
			const __m256i fdir_mask = _mm256_cmpeq_epi32(fdir, fdir_id);
			__m256i fdir_bits = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_bits);

			/* FDIR_BLEND_MASK selects the RSS hash dwords of the
			 * mbuf data: zero them for packets with a valid FDIR
			 * ID, since RSS hash and FDIR ID share that location.
			 */
			const __m256i fdir_zero_mask = _mm256_setzero_si256();
			__m256i tmp0_1 = _mm256_blend_epi32(fdir_zero_mask,
					fdir_mask, FDIR_BLEND_MASK);
			__m256i fdir_mb0_1 = _mm256_and_si256(mb0_1, fdir_mask);
			mb0_1 = _mm256_andnot_si256(tmp0_1, mb0_1);

			/* Write to mbuf: there are no stores to combine with,
			 * so just use a scalar store to push the data here.
			 */
			rx_pkts[i + 0]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb0_1, 3);
			rx_pkts[i + 1]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb0_1, 7);

			/* Same as above, only rotate the fdir_mask to align
			 * with the mbuf pairs for descriptors 2-3, 4-5, 6-7.
			 */
			__m256i tmp2_3 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 12);
			__m256i fdir_mb2_3 = _mm256_and_si256(mb2_3, tmp2_3);
			tmp2_3 = _mm256_blend_epi32(fdir_zero_mask, tmp2_3,
					FDIR_BLEND_MASK);
			mb2_3 = _mm256_andnot_si256(tmp2_3, mb2_3);
			rx_pkts[i + 2]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb2_3, 3);
			rx_pkts[i + 3]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb2_3, 7);

			__m256i tmp4_5 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 8);
			__m256i fdir_mb4_5 = _mm256_and_si256(mb4_5, tmp4_5);
			tmp4_5 = _mm256_blend_epi32(fdir_zero_mask, tmp4_5,
					FDIR_BLEND_MASK);
			mb4_5 = _mm256_andnot_si256(tmp4_5, mb4_5);
			rx_pkts[i + 4]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb4_5, 3);
			rx_pkts[i + 5]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb4_5, 7);

			__m256i tmp6_7 = _mm256_alignr_epi8(fdir_mask, fdir_mask, 4);
			__m256i fdir_mb6_7 = _mm256_and_si256(mb6_7, tmp6_7);
			tmp6_7 = _mm256_blend_epi32(fdir_zero_mask, tmp6_7,
					FDIR_BLEND_MASK);
			mb6_7 = _mm256_andnot_si256(tmp6_7, mb6_7);
			rx_pkts[i + 6]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb6_7, 3);
			rx_pkts[i + 7]->hash.fdir.hi = _mm256_extract_epi32(fdir_mb6_7, 7);

			/* End of 16B descriptor handling */
#else /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
			/* 32B descriptor FDIR ID mark, read from the second
			 * 16B half of each descriptor; also ORs in the
			 * PKT_RX_FDIR_ID flag bits for matching packets
			 */
			__m256i fdir_add_flags;
			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 0);
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);

			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 2);
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);

			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 4);
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);

			fdir_add_flags = desc_fdir_processing_32b(rxdp, rx_pkts, i, 6);
			mbuf_flags = _mm256_or_si256(mbuf_flags, fdir_add_flags);
#endif /* RTE_LIBRTE_I40E_16BYTE_RX_DESC */
		} /* if() on FDIR enabled */

		/* At this point, we have the 8 sets of flags in the low
		 * 16 bits of each 32-bit value in mbuf_flags.
		 * We want to extract these, and merge them with the mbuf init
		 * data so we can do a single write to the mbuf to set the
		 * flags and all the other initialization fields. Extracting
		 * the appropriate flags means a shift and blend for each mbuf
		 * before the write. However, we can also add in the
		 * previously computed rx_descriptor fields to make a single
		 * 256-bit write per mbuf.
		 */
		/* check the structure matches expectations */
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
				offsetof(struct rte_mbuf, rearm_data) + 8);
		RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
				RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));

		/* build up the rearm data and do the writes: the flags dword
		 * belongs at offset 8 of rearm_data, i.e. 32-bit lane 2,
		 * hence the 0x04 blend mask below
		 */
		__m256i rearm0, rearm1, rearm2, rearm3, rearm4, rearm5,
				rearm6, rearm7;
		rearm6 = _mm256_blend_epi32(mbuf_init, _mm256_slli_si256(mbuf_flags, 8), 0x04);
		rearm4 = _mm256_blend_epi32(mbuf_init, _mm256_slli_si256(mbuf_flags, 4), 0x04);
		rearm2 = _mm256_blend_epi32(mbuf_init, mbuf_flags, 0x04);
		rearm0 = _mm256_blend_epi32(mbuf_init, _mm256_srli_si256(mbuf_flags, 4), 0x04);

		/* permute to add in the rx_descriptor fields, e.g. rss hash */
		rearm6 = _mm256_permute2f128_si256(rearm6, mb6_7, 0x20);
		rearm4 = _mm256_permute2f128_si256(rearm4, mb4_5, 0x20);
		rearm2 = _mm256_permute2f128_si256(rearm2, mb2_3, 0x20);
		rearm0 = _mm256_permute2f128_si256(rearm0, mb0_1, 0x20);

		/* write to mbuf */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 6]->rearm_data, rearm6);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 4]->rearm_data, rearm4);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 2]->rearm_data, rearm2);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 0]->rearm_data, rearm0);

		/* repeat for the odd mbufs */
		const __m256i odd_flags = _mm256_castsi128_si256(
				_mm256_extracti128_si256(mbuf_flags, 1));
		rearm7 = _mm256_blend_epi32(mbuf_init, _mm256_slli_si256(odd_flags, 8), 0x04);
		rearm5 = _mm256_blend_epi32(mbuf_init, _mm256_slli_si256(odd_flags, 4), 0x04);
		rearm3 = _mm256_blend_epi32(mbuf_init, odd_flags, 0x04);
		rearm1 = _mm256_blend_epi32(mbuf_init, _mm256_srli_si256(odd_flags, 4), 0x04);

		/* since the odd mbufs' data is already in the hi 128 bits, blend */
		rearm7 = _mm256_blend_epi32(rearm7, mb6_7, 0xF0);
		rearm5 = _mm256_blend_epi32(rearm5, mb4_5, 0xF0);
		rearm3 = _mm256_blend_epi32(rearm3, mb2_3, 0xF0);
		rearm1 = _mm256_blend_epi32(rearm1, mb0_1, 0xF0);

		/* again write to mbufs */
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 7]->rearm_data, rearm7);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 5]->rearm_data, rearm5);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 3]->rearm_data, rearm3);
		_mm256_storeu_si256((__m256i *)&rx_pkts[i + 1]->rearm_data, rearm1);

		/* extract and record the EOP bit */
		if (split_packet) {
			const __m128i eop_mask = _mm_set1_epi16(
					1 << I40E_RX_DESC_STATUS_EOF_SHIFT);
			const __m256i eop_bits256 = _mm256_and_si256(status0_7,
					eop_check);
			/* pack status bits into a single 128-bit register */
			const __m128i eop_bits = _mm_packus_epi32(
					_mm256_castsi256_si128(eop_bits256),
					_mm256_extractf128_si256(eop_bits256, 1));

			/* flip bits, and mask out the EOP bit, which is now
			 * a split-packet bit, i.e. !EOP rather than EOP
			 */
			__m128i split_bits = _mm_andnot_si128(eop_bits,
					eop_mask);

			/* the eop bits are out of order, so shuffle them back
			 * into order again. In doing so, only use the low 8
			 * bits, which acts like another pack instruction.
			 * The original order is (hi->lo): 1,3,5,7,0,2,4,6.
			 * [Since we use epi8, the 16-bit positions are
			 * multiplied by 2 in the eop_shuffle value.]
			 */
			__m128i eop_shuffle = _mm_set_epi8(
					/* zero hi 64b */
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					/* move values to lo 64b */
					8, 0, 10, 2,
					12, 4, 14, 6);
			split_bits = _mm_shuffle_epi8(split_bits, eop_shuffle);
			*(uint64_t *)split_packet = _mm_cvtsi128_si64(split_bits);
			split_packet += RTE_I40E_DESCS_PER_LOOP_AVX;
		}

		/* perform the dd_check: zero all but the DD bits, then pack
		 * the 32-bit values down to 16-bit for counting below
		 */
		status0_7 = _mm256_and_si256(status0_7, dd_check);
		status0_7 = _mm256_packs_epi32(status0_7,
				_mm256_setzero_si256());

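		/* popcount the DD bits to learn how many packets actually
		 * completed this iteration; stop on the first partial batch
		 */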
		uint64_t burst = __builtin_popcountll(_mm_cvtsi128_si64(
				_mm256_extracti128_si256(status0_7, 1)));
		burst += __builtin_popcountll(_mm_cvtsi128_si64(
				_mm256_castsi256_si128(status0_7)));
		received += burst;
		if (burst != RTE_I40E_DESCS_PER_LOOP_AVX)
			break;
	}

	/* update tail pointers */
	rxq->rx_tail += received;
	rxq->rx_tail &= (rxq->nb_rx_desc - 1);
	if ((rxq->rx_tail & 1) == 1 && received > 1) { /* keep avx2 aligned */
		rxq->rx_tail--;
		received--;
	}
	rxq->rxrearm_nb += received;
	return received;
}

/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 */
uint16_t
i40e_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	return _recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
}

/**
 * vPMD receive routine that reassembles a single burst of 32 scattered packets
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 */
static uint16_t
i40e_recv_scattered_burst_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	struct i40e_rx_queue *rxq = rx_queue;
	uint8_t split_flags[RTE_I40E_VPMD_RX_BURST] = {0};

	/* get some new buffers */
	uint16_t nb_bufs = _recv_raw_pkts_vec_avx2(rxq, rx_pkts, nb_pkts,
			split_flags);
	if (nb_bufs == 0)
		return 0;

	/* happy day case: full burst and no packets to be joined */
	const uint64_t *split_fl64 = (uint64_t *)split_flags;

	if (rxq->pkt_first_seg == NULL &&
			split_fl64[0] == 0 && split_fl64[1] == 0 &&
			split_fl64[2] == 0 && split_fl64[3] == 0)
		return nb_bufs;

	/* reassemble any packets that need reassembly */
	unsigned int i = 0;

	if (rxq->pkt_first_seg == NULL) {
		/* find the first split flag, and only reassemble from there */
		while (i < nb_bufs && !split_flags[i])
			i++;
		if (i == nb_bufs)
			return nb_bufs;
		rxq->pkt_first_seg = rx_pkts[i];
	}
	return i + reassemble_packets(rxq, &rx_pkts[i], nb_bufs - i,
			&split_flags[i]);
}

/**
 * vPMD receive routine that reassembles scattered packets.
 * Main receive routine that can handle arbitrary burst sizes
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 */
uint16_t
i40e_recv_scattered_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	uint16_t retval = 0;
	while (nb_pkts > RTE_I40E_VPMD_RX_BURST) {
		uint16_t burst = i40e_recv_scattered_burst_vec_avx2(rx_queue,
				rx_pkts + retval, RTE_I40E_VPMD_RX_BURST);
		retval += burst;
		nb_pkts -= burst;
		if (burst < RTE_I40E_VPMD_RX_BURST)
			return retval;
	}
	return retval + i40e_recv_scattered_burst_vec_avx2(rx_queue,
			rx_pkts + retval, nb_pkts);
}

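/* Write a single 16B Tx data descriptor: the low qword carries the buffer
 * DMA address, the high qword the descriptor type, command flags and the
 * buffer size.
 */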
static inline void
vtx1(volatile struct i40e_tx_desc *txdp,
		struct rte_mbuf *pkt, uint64_t flags)
{
	uint64_t high_qw = (I40E_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << I40E_TXD_QW1_CMD_SHIFT) |
			((uint64_t)pkt->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT));

	__m128i descriptor = _mm_set_epi64x(high_qw,
			pkt->buf_iova + pkt->data_off);
	_mm_store_si128((__m128i *)txdp, descriptor);
}

static inline void
vtx(volatile struct i40e_tx_desc *txdp,
		struct rte_mbuf **pkt, uint16_t nb_pkts, uint64_t flags)
{
	const uint64_t hi_qw_tmpl = (I40E_TX_DESC_DTYPE_DATA |
			((uint64_t)flags << I40E_TXD_QW1_CMD_SHIFT));

	/* if unaligned on a 32-byte boundary, do one write to align */
	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
		vtx1(txdp, *pkt, flags);
		nb_pkts--, txdp++, pkt++;
	}

	/* do 4 descriptors at a time while possible, in bursts */
	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
		uint64_t hi_qw3 = hi_qw_tmpl |
				((uint64_t)pkt[3]->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
		uint64_t hi_qw2 = hi_qw_tmpl |
				((uint64_t)pkt[2]->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
		uint64_t hi_qw1 = hi_qw_tmpl |
				((uint64_t)pkt[1]->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT);
		uint64_t hi_qw0 = hi_qw_tmpl |
				((uint64_t)pkt[0]->data_len << I40E_TXD_QW1_TX_BUF_SZ_SHIFT);

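		/* pack two 16B descriptors per 256-bit register and push them
		 * out with two aligned 32-byte stores
		 */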
		__m256i desc2_3 = _mm256_set_epi64x(
				hi_qw3, pkt[3]->buf_iova + pkt[3]->data_off,
				hi_qw2, pkt[2]->buf_iova + pkt[2]->data_off);
		__m256i desc0_1 = _mm256_set_epi64x(
				hi_qw1, pkt[1]->buf_iova + pkt[1]->data_off,
				hi_qw0, pkt[0]->buf_iova + pkt[0]->data_off);
		_mm256_store_si256((void *)(txdp + 2), desc2_3);
		_mm256_store_si256((void *)txdp, desc0_1);
	}

	/* do any last ones */
	while (nb_pkts) {
		vtx1(txdp, *pkt, flags);
		txdp++, pkt++, nb_pkts--;
	}
}

static inline uint16_t
i40e_xmit_fixed_burst_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)
{
	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;
	volatile struct i40e_tx_desc *txdp;
	struct i40e_tx_entry *txep;
	uint16_t n, nb_commit, tx_id;
	uint64_t flags = I40E_TD_CMD;
	uint64_t rs = I40E_TX_DESC_CMD_RS | I40E_TD_CMD;

	/* crossing the tx_rs_thresh boundary is not allowed */
	nb_pkts = RTE_MIN(nb_pkts, txq->tx_rs_thresh);

	if (txq->nb_tx_free < txq->tx_free_thresh)
		i40e_tx_free_bufs(txq);

	nb_commit = nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
	if (unlikely(nb_pkts == 0))
		return 0;

	tx_id = txq->tx_tail;
	txdp = &txq->tx_ring[tx_id];
	txep = &txq->sw_ring[tx_id];

	txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);

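	/* n is the number of descriptors left before the ring wraps: if the
	 * burst crosses the end of the ring, fill up to the wrap point, set
	 * RS on the last descriptor there, then continue from the ring start
	 */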
	n = (uint16_t)(txq->nb_tx_desc - tx_id);
	if (nb_commit >= n) {
		tx_backlog_entry(txep, tx_pkts, n);

		vtx(txdp, tx_pkts, n - 1, flags);
		tx_pkts += (n - 1);
		txdp += (n - 1);

		vtx1(txdp, *tx_pkts++, rs);

		nb_commit = (uint16_t)(nb_commit - n);

		tx_id = 0;
		txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);

		/* avoid reaching the end of the ring */
		txdp = &txq->tx_ring[tx_id];
		txep = &txq->sw_ring[tx_id];
	}

	tx_backlog_entry(txep, tx_pkts, nb_commit);

	vtx(txdp, tx_pkts, nb_commit, flags);

	tx_id = (uint16_t)(tx_id + nb_commit);
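
	/* once the tail passes the next RS threshold, request a descriptor
	 * write-back by setting the RS bit, and advance the threshold
	 */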
	if (tx_id > txq->tx_next_rs) {
		txq->tx_ring[txq->tx_next_rs].cmd_type_offset_bsz |=
			rte_cpu_to_le_64(((uint64_t)I40E_TX_DESC_CMD_RS) <<
					I40E_TXD_QW1_CMD_SHIFT);
		txq->tx_next_rs =
			(uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
	}

	txq->tx_tail = tx_id;

	/* bump the tail register to notify hardware */
	I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);

	return nb_pkts;
}

uint16_t
i40e_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
		uint16_t nb_pkts)
{
	uint16_t nb_tx = 0;
	struct i40e_tx_queue *txq = (struct i40e_tx_queue *)tx_queue;

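	/* transmit in chunks of at most tx_rs_thresh packets, stopping early
	 * if the ring fills up
	 */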
	while (nb_pkts) {
		uint16_t ret, num;

		num = (uint16_t)RTE_MIN(nb_pkts, txq->tx_rs_thresh);
		ret = i40e_xmit_fixed_burst_vec_avx2(tx_queue, &tx_pkts[nb_tx],
				num);
		nb_tx += ret;
		nb_pkts -= ret;
		if (ret < num)
			break;
	}

	return nb_tx;
}