1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Calculate the address of the next transition for each of the
 * SIMD-width flows in parallel.  tr_lo holds node type + base address,
 * tr_hi holds the per-node range data used for the offset calculation.
 */
static __rte_always_inline _T_simd
_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
	_T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
{
	__mmask64 qm;
	_T_mask dfa_msk;
	_T_simd addr, in, node_type, r, t;
	_T_simd dfa_ofs, quad_ofs;

	/* t = 0 (x ^ x); cheap way to get an all-zero vector */
	t = _M_SI_(xor)(index_mask, index_mask);
	/* shuffle the pending input bytes into position within each lane */
	in = _M_I_(shuffle_epi8)(next_input, shuffle_input);

	/* split tr_lo: node type (bits outside index_mask) and node address */
	node_type = _M_SI_(andnot)(index_mask, tr_lo);
	addr = _M_SI_(and)(index_mask, tr_lo);

	/* mask of lanes whose node type is zero (DFA nodes, per dfa_msk name) */
	dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);

	/*
	 * DFA node offset: select a range-base byte keyed by the top two
	 * bits of the input byte, then subtract it from the input byte.
	 */
	r = _M_I_(srli_epi32)(in, 30);
	r = _M_I_(add_epi8)(r, range_base);
	t = _M_I_(srli_epi32)(in, 24);
	r = _M_I_(shuffle_epi8)(tr_hi, r);

	dfa_ofs = _M_I_(sub_epi32)(t, r);

	/*
	 * QUAD node offset: per-byte compare of input against tr_hi range
	 * bytes, then count the matching bytes within each 32-bit lane
	 * (lzcnt of the materialized compare mask, divided by 8) and
	 * subtract that count from four.
	 */
	qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
	t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
	t = _M_I_(lzcnt_epi32)(t);
	t = _M_I_(srli_epi32)(t, 3);
	quad_ofs = _M_I_(sub_epi32)(four_32, t);

	/* per lane: DFA offset where dfa_msk is set, QUAD offset otherwise */
	t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);

	/* next-transition address = node base address + selected offset */
	addr = _M_I_(add_epi32)(addr, t);
	return addr;
}
66
67
68
69
70
71
72
/*
 * Process one input byte for all flows in parallel: compute each flow's
 * next-transition index and gather the 64-bit transition entry as two
 * 32-bit halves (*tr_lo, *tr_hi).  Returns the input shifted by one byte
 * so the caller can feed the next byte on the following call.
 */
static __rte_always_inline _T_simd
_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
	_T_simd *tr_hi)
{
	const int32_t *tr;
	_T_simd addr;

	/* view the 64-bit transition table as pairs of 32-bit words */
	tr = (const int32_t *)(uintptr_t)trans;

	/* calculate the transition-table index for each flow */
	addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
		_SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);

	/* gather the low 32 bits of each 64-bit transition */
	*tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));

	/* consume the processed byte, exposing the next one */
	next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);

	/* gather the high 32 bits of each 64-bit transition */
	*tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));

	return next_input;
}
96
97
98
99
100
101
102
103
/*
 * Fetch the initial transition for the flows selected by @msk:
 * index = root_index + (first input byte), then gather the 64-bit
 * transition entry as two 32-bit halves into *tr_lo / *tr_hi.
 * Lanes not in @msk keep their previous tr_lo/tr_hi values.
 */
static __rte_always_inline void
_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
	_T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
{
	const int32_t *tr;
	_T_simd addr, root;

	/* view the 64-bit transition table as pairs of 32-bit words */
	tr = (const int32_t *)(uintptr_t)flow->trans;

	addr = _M_I_(set1_epi32)(UINT8_MAX);
	root = _M_I_(set1_epi32)(flow->root_index);

	/* keep only the low byte of each lane's input, offset from root */
	addr = _M_SI_(and)(next_input, addr);
	addr = _M_I_(add_epi32)(root, addr);

	/* gather the low 32 bits of the transitions for the masked lanes */
	*tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
		sizeof(flow->trans[0]));

	/* gather the high 32 bits of the transitions for the masked lanes */
	*tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
		sizeof(flow->trans[0]));
}
127
128
129
130
131
132
133
/*
 * Load the next @bnum input bytes for the flows selected by @msk.
 * *di holds each flow's current data index; it is advanced by one for
 * every selected lane.  Returns the gathered bytes (via gather_bytes).
 */
static inline _T_simd
_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
	uint32_t msk, _T_simd *di, uint32_t bnum)
{
	const int32_t *div;
	uint32_t m[2];
	_T_simd one, zero, t, p[2];

	div = (const int32_t *)flow->data_index;

	one = _M_I_(set1_epi32)(1);
	/* zero = 1 ^ 1 */
	zero = _M_SI_(xor)(one, one);

	/* gather per-flow data offsets; unselected lanes read as zero */
	t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));

	/* advance the data index for the selected lanes only */
	*di = _M_I_(mask_add_epi32)(*di, msk, *di, one);

	/*
	 * Spread the gathered 32-bit offsets across the two 64-bit pointer
	 * vectors; pmidx[0]/pmidx[1] presumably map 32-bit lanes onto
	 * 64-bit lane positions -- TODO confirm against the pmidx tables.
	 */
	p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
		t);
	p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
		t);

	/* absolute read addresses = packet pointer + data offset */
	p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
	p[1] = _M_I_(add_epi64)(p[1], pdata[1]);

	/* split the lane mask into the two pointer-vector halves */
	m[0] = msk & _SIMD_PTR_MSK_;
	m[1] = msk >> _SIMD_PTR_NUM_;

	return _F_(gather_bytes)(zero, p, m, bnum);
}
174
175
176
177
178
179
180
181
182
/*
 * Start processing @num new packets in the lanes selected by @msk:
 * load their input-data pointers from flow->idata, expand them into the
 * free lanes of pdata[], record their packet numbers in *idx, and reset
 * their data indexes in *di to zero.
 */
static inline void
_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
	_T_simd pdata[2], _T_simd *idx, _T_simd *di)
{
	uint32_t n, m[2], nm[2];
	_T_simd ni, nd[2];

	/* split the lane mask into the two pointer-vector halves */
	m[0] = msk & _SIMD_PTR_MSK_;
	m[1] = msk >> _SIMD_PTR_NUM_;

	/* contiguous load masks: n new pointers go to half 0, the rest to 1 */
	n = __builtin_popcount(m[0]);
	nm[0] = (1 << n) - 1;
	nm[1] = (1 << (num - n)) - 1;

	/* load the next num packet-data pointers */
	nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
		flow->idata + flow->num_packets);
	nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
		flow->idata + flow->num_packets + n);

	/* packet numbers for the new flows: num_packets + lane offsets */
	ni = _M_I_(set1_epi32)(flow->num_packets);
	ni = _M_I_(add_epi32)(ni, _SV_(idx_add));

	/* scatter (expand) the new pointers into the free lanes */
	pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
	pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);

	/* expand packet numbers; zero the data index of the new lanes only */
	*idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
	*di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);

	flow->num_packets += num;
}
219
220
221
222
223
224
225
226
227
228
229
/*
 * Process lanes that reached a match node (rmsk[0]): extract the match
 * index, scatter it into flow->matches, park the matched lanes on idle
 * transitions, and start new flows in the freed lanes.
 * Returns the number of new flows started (0 if no lanes matched).
 */
static inline uint32_t
_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
	uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
	_T_simd *tr_lo, _T_simd *tr_hi)
{
	uint32_t n;
	_T_simd res;

	if (rmsk[0] == 0)
		return 0;

	/* extract match indexes from the low transition word */
	res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));

	/* mask matched lanes to idle transitions so they stop progressing */
	tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
	tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));

	/* save found match indexes, keyed by each lane's packet number */
	_M_I_(mask_i32scatter_epi32)((void *)flow->matches, rmsk[0], idx[0],
		res, sizeof(flow->matches[0]));

	/* update masks and refill the freed lanes with new flows */
	n = update_flow_mask(flow, fmsk, rmsk);
	_F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);

	return n;
}
258
259
260
261
262
/*
 * Check both flow groups for lanes that hit a match node and process
 * them, repeating until no lane is on a match.  Newly started flows get
 * their first bytes loaded and their first transition fetched, which can
 * itself be a match -- hence the loop.
 */
static inline void
_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
	_T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
	_T_simd tr_lo[2], _T_simd tr_hi[2])
{
	uint32_t n[2];
	uint32_t rm[2];

	/* check for matches in both groups */
	rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
	rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));

	while ((rm[0] | rm[1]) != 0) {

		/* record matches and start new flows in the freed lanes */
		n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
			&di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
		n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
			&di[1], &idx[1], &tr_lo[1], &tr_hi[1]);

		/* if new flows started, fetch their first transition and
		 * re-check for matches */
		if (n[0] != 0) {
			inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
				rm[0], &di[0], flow->first_load_sz);
			_F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
				&tr_hi[0]);
			rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
				_SV_(match_mask));
		}

		if (n[1] != 0) {
			inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
				rm[1], &di[1], flow->first_load_sz);
			_F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
				&tr_hi[1]);
			rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
				_SV_(match_mask));
		}
	}
}
305
306static inline void
307_F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4],
308 _T_simd tr_lo[2], _T_simd tr_hi[2])
309{
310 di[0] = _M_SI_(setzero)();
311 di[1] = _M_SI_(setzero)();
312
313 idx[0] = _M_SI_(setzero)();
314 idx[1] = _M_SI_(setzero)();
315
316 pdata[0] = _M_SI_(setzero)();
317 pdata[1] = _M_SI_(setzero)();
318 pdata[2] = _M_SI_(setzero)();
319 pdata[3] = _M_SI_(setzero)();
320
321 tr_lo[0] = _M_SI_(setzero)();
322 tr_lo[1] = _M_SI_(setzero)();
323
324 tr_hi[0] = _M_SI_(setzero)();
325 tr_hi[1] = _M_SI_(setzero)();
326}
327
328
329
330
331
/*
 * Run one trie over all packets in @flow.  Two SIMD groups of flows are
 * processed in parallel; each loop iteration loads 4 input bytes per
 * flow and advances the automaton 4 times (one trans() call per byte),
 * then handles any matches and refills freed lanes with new packets.
 */
static inline void
_F_(search_trie)(struct acl_flow_avx512 *flow)
{
	uint32_t fm[2];
	_T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];

	_F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi);

	/* fill all lanes of both groups with the first packets */
	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
		&pdata[0], &idx[0], &di[0]);
	_F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
		&pdata[2], &idx[1], &di[1]);

	in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
		flow->first_load_sz);
	in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
		flow->first_load_sz);

	/* take the root transition for every lane */
	_F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
	_F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);

	fm[0] = _SIMD_MASK_MAX_;
	fm[1] = _SIMD_MASK_MAX_;

	/* the first transition can already be a match */
	_F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);

	while ((fm[0] | fm[1]) != 0) {

		/* load 4 bytes of input data for each active flow */
		in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
			&di[0], sizeof(uint32_t));
		in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
			&di[1], sizeof(uint32_t));

		/* process the 4 bytes, one trans() call per byte */
		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
		in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);

		/* handle matches and start new flows in freed lanes */
		_F_(match_check_process)(flow, fm, pdata, di, idx, in,
			tr_lo, tr_hi);
	}
}
388
389
390
391
392static inline _T_simd
393_F_(resolve_match_idx)(_T_simd mi)
394{
395 RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
396 1 << (ACL_MATCH_LOG + 2));
397 return _M_I_(slli_epi32)(mi, ACL_MATCH_LOG);
398}
399
400
401
402
/*
 * For the lanes in @msk, combine the match results from @nb_trie tries
 * (match indexes stored @nb_skip apart per trie) and return the result
 * value with the highest priority for each lane.
 */
static inline _T_simd
_F_(resolve_pri)(const int32_t res[], const int32_t pri[],
	const uint32_t match[], _T_mask msk, uint32_t nb_trie,
	uint32_t nb_skip)
{
	uint32_t i;
	const uint32_t *pm;
	_T_mask m;
	_T_simd cp, cr, np, nr, mch;

	const _T_simd zero = _M_I_(set1_epi32)(0);

	/* load match indexes for the first trie and scale to array offsets */
	mch = _M_I_(maskz_loadu_epi32)(msk, match);
	mch = _F_(resolve_match_idx)(mch);

	/* current result (cr) and priority (cp) from the first trie */
	cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
	cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));

	/*
	 * For each remaining trie, gather its result/priority and keep
	 * whichever has the higher priority per lane.
	 */
	for (i = 1, pm = match + nb_skip; i != nb_trie;
			i++, pm += nb_skip) {

		mch = _M_I_(maskz_loadu_epi32)(msk, pm);
		mch = _F_(resolve_match_idx)(mch);

		nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
			sizeof(res[0]));
		np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
			sizeof(pri[0]));

		/* where current priority wins (cp > np), keep cr/cp;
		 * otherwise take the new trie's nr/np */
		m = _M_I_(cmpgt_epi32_mask)(cp, np);
		cr = _M_I_(mask_mov_epi32)(nr, m, cr);
		cp = _M_I_(mask_mov_epi32)(np, m, cp);
	}

	return cr;
}
445
446
447
448
449static inline void
450_F_(resolve_sc)(uint32_t result[], const int32_t res[],
451 const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
452 uint32_t nb_trie, uint32_t nb_skip)
453{
454 _T_mask msk;
455 _T_simd cr;
456
457 msk = (1 << nb_pkt) - 1;
458 cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
459 _M_I_(mask_storeu_epi32)(result, msk, cr);
460}
461
462
463
464
/*
 * Resolve match results for all @nb_pkt packets in the single-category
 * case: full 2x-SIMD chunks are resolved with unmasked stores, then the
 * remainder (up to one full group plus a partial one) goes through the
 * masked resolve_sc() path.
 */
static inline void
_F_(resolve_single_cat)(uint32_t result[],
	const struct rte_acl_match_results pr[], const uint32_t match[],
	uint32_t nb_pkt, uint32_t nb_trie)
{
	uint32_t j, k, n;
	const int32_t *res, *pri;
	_T_simd cr[2];

	res = (const int32_t *)pr->results;
	pri = pr->priority;

	/* process full chunks of _SIMD_FLOW_NUM_ (two groups) packets */
	for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {

		j = k + _SIMD_MASK_BIT_;

		cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
			nb_trie, nb_pkt);
		cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
			nb_trie, nb_pkt);

		_M_SI_(storeu)((void *)(result + k), cr[0]);
		_M_SI_(storeu)((void *)(result + j), cr[1]);
	}

	/* remainder: at most one full group followed by a partial one */
	n = nb_pkt - k;
	if (n != 0) {
		if (n > _SIMD_MASK_BIT_) {
			_F_(resolve_sc)(result + k, res, pri, match + k,
				_SIMD_MASK_BIT_, nb_trie, nb_pkt);
			k += _SIMD_MASK_BIT_;
			n -= _SIMD_MASK_BIT_;
		}
		_F_(resolve_sc)(result + k, res, pri, match + k, n,
			nb_trie, nb_pkt);
	}
}
502