LXR dpdk/lib/acl/acl_run_avx512

   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2020 Intel Corporation
   3 */
   4
   5/*
   6 * WARNING: It is not recommended to include this file directly.
   7 * Please include "acl_run_avx512x*.h" instead.
   8 * To make this file to generate proper code an includer has to
   9 * define several macros, refer to "acl_run_avx512x*.h" for more details.
  10 */
  11
  12/*
  13 * Calculate the address of the next transition for
  14 * all types of nodes. Note that only DFA nodes and range
  15 * nodes actually transition to another node. Match
  16 * nodes not supposed to be encountered here.
  17 * For quad range nodes:
  18 * Calculate number of range boundaries that are less than the
  19 * input value. Range boundaries for each node are in signed 8 bit,
  20 * ordered from -128 to 127.
  21 * This is effectively a popcnt of bytes that are greater than the
  22 * input byte.
  23 * Single nodes are processed in the same ways as quad range nodes.
  24 */
  25static __rte_always_inline _T_simd
  26_F_(calc_addr)(_T_simd index_mask, _T_simd next_input, _T_simd shuffle_input,
  27        _T_simd four_32, _T_simd range_base, _T_simd tr_lo, _T_simd tr_hi)
  28{
  29        __mmask64 qm;
  30        _T_mask dfa_msk;
  31        _T_simd addr, in, node_type, r, t;
  32        _T_simd dfa_ofs, quad_ofs;
  33
  34        t = _M_SI_(xor)(index_mask, index_mask);
  35        in = _M_I_(shuffle_epi8)(next_input, shuffle_input);
  36
  37        /* Calc node type and node addr */
  38        node_type = _M_SI_(andnot)(index_mask, tr_lo);
  39        addr = _M_SI_(and)(index_mask, tr_lo);
  40
  41        /* mask for DFA type(0) nodes */
  42        dfa_msk = _M_I_(cmpeq_epi32_mask)(node_type, t);
  43
  44        /* DFA calculations. */
  45        r = _M_I_(srli_epi32)(in, 30);
  46        r = _M_I_(add_epi8)(r, range_base);
  47        t = _M_I_(srli_epi32)(in, 24);
  48        r = _M_I_(shuffle_epi8)(tr_hi, r);
  49
  50        dfa_ofs = _M_I_(sub_epi32)(t, r);
  51
  52        /* QUAD/SINGLE calculations. */
  53        qm = _M_I_(cmpgt_epi8_mask)(in, tr_hi);
  54        t = _M_I_(maskz_set1_epi8)(qm, (uint8_t)UINT8_MAX);
  55        t = _M_I_(lzcnt_epi32)(t);
  56        t = _M_I_(srli_epi32)(t, 3);
  57        quad_ofs = _M_I_(sub_epi32)(four_32, t);
  58
  59        /* blend DFA and QUAD/SINGLE. */
  60        t = _M_I_(mask_mov_epi32)(quad_ofs, dfa_msk, dfa_ofs);
  61
  62        /* calculate address for next transitions. */
  63        addr = _M_I_(add_epi32)(addr, t);
  64        return addr;
  65}
  66
  67/*
  68 * Process _N_ transitions in parallel.
  69 * tr_lo contains low 32 bits for _N_ transition.
  70 * tr_hi contains high 32 bits for _N_ transition.
  71 * next_input contains up to 4 input bytes for _N_ flows.
  72 */
  73static __rte_always_inline _T_simd
  74_F_(trans)(_T_simd next_input, const uint64_t *trans, _T_simd *tr_lo,
  75        _T_simd *tr_hi)
  76{
  77        const int32_t *tr;
  78        _T_simd addr;
  79
  80        tr = (const int32_t *)(uintptr_t)trans;
  81
  82        /* Calculate the address (array index) for all _N_ transitions. */
  83        addr = _F_(calc_addr)(_SV_(index_mask), next_input, _SV_(shuffle_input),
  84                _SV_(four_32), _SV_(range_base), *tr_lo, *tr_hi);
  85
  86        /* load lower 32 bits of _N_ transactions at once. */
  87        *tr_lo = _M_GI_(i32gather_epi32, addr, tr, sizeof(trans[0]));
  88
  89        next_input = _M_I_(srli_epi32)(next_input, CHAR_BIT);
  90
  91        /* load high 32 bits of _N_ transactions at once. */
  92        *tr_hi = _M_GI_(i32gather_epi32, addr, (tr + 1), sizeof(trans[0]));
  93
  94        return next_input;
  95}
  96
  97/*
  98 * Execute first transition for up to _N_ flows in parallel.
  99 * next_input should contain one input byte for up to _N_ flows.
 100 * msk - mask of active flows.
 101 * tr_lo contains low 32 bits for up to _N_ transitions.
 102 * tr_hi contains high 32 bits for up to _N_ transitions.
 103 */
 104static __rte_always_inline void
 105_F_(first_trans)(const struct acl_flow_avx512 *flow, _T_simd next_input,
 106        _T_mask msk, _T_simd *tr_lo, _T_simd *tr_hi)
 107{
 108        const int32_t *tr;
 109        _T_simd addr, root;
 110
 111        tr = (const int32_t *)(uintptr_t)flow->trans;
 112
 113        addr = _M_I_(set1_epi32)(UINT8_MAX);
 114        root = _M_I_(set1_epi32)(flow->root_index);
 115
 116        addr = _M_SI_(and)(next_input, addr);
 117        addr = _M_I_(add_epi32)(root, addr);
 118
 119        /* load lower 32 bits of _N_ transactions at once. */
 120        *tr_lo = _M_MGI_(mask_i32gather_epi32)(*tr_lo, msk, addr, tr,
 121                sizeof(flow->trans[0]));
 122
 123        /* load high 32 bits of _N_ transactions at once. */
 124        *tr_hi = _M_MGI_(mask_i32gather_epi32)(*tr_hi, msk, addr, (tr + 1),
 125                sizeof(flow->trans[0]));
 126}
 127
 128/*
 129 * Load and return next 4 input bytes for up to _N_ flows in parallel.
 130 * pdata - 8x2 pointers to flow input data
 131 * mask - mask of active flows.
 132 * di - data indexes for these _N_ flows.
 133 */
 134static inline _T_simd
 135_F_(get_next_bytes)(const struct acl_flow_avx512 *flow, _T_simd pdata[2],
 136        uint32_t msk, _T_simd *di, uint32_t bnum)
 137{
 138        const int32_t *div;
 139        uint32_t m[2];
 140        _T_simd one, zero, t, p[2];
 141
 142        div = (const int32_t *)flow->data_index;
 143
 144        one = _M_I_(set1_epi32)(1);
 145        zero = _M_SI_(xor)(one, one);
 146
 147        /* load data offsets for given indexes */
 148        t = _M_MGI_(mask_i32gather_epi32)(zero, msk, *di, div, sizeof(div[0]));
 149
 150        /* increment data indexes */
 151        *di = _M_I_(mask_add_epi32)(*di, msk, *di, one);
 152
 153        /*
 154         * unsigned expand 32-bit indexes to 64-bit
 155         * (for later pointer arithmetic), i.e:
 156         * for (i = 0; i != _N_; i++)
 157         *   p[i/8].u64[i%8] = (uint64_t)t.u32[i];
 158         */
 159        p[0] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[0]),
 160                        t);
 161        p[1] = _M_I_(maskz_permutexvar_epi32)(_SC_(pmidx_msk), _SV_(pmidx[1]),
 162                        t);
 163
 164        p[0] = _M_I_(add_epi64)(p[0], pdata[0]);
 165        p[1] = _M_I_(add_epi64)(p[1], pdata[1]);
 166
 167        /* load input byte(s), either one or four */
 168
 169        m[0] = msk & _SIMD_PTR_MSK_;
 170        m[1] = msk >> _SIMD_PTR_NUM_;
 171
 172        return _F_(gather_bytes)(zero, p, m, bnum);
 173}
 174
 175/*
 176 * Start up to _N_ new flows.
 177 * num - number of flows to start
 178 * msk - mask of new flows.
 179 * pdata - pointers to flow input data
 180 * idx - match indexed for given flows
 181 * di - data indexes for these flows.
 182 */
 183static inline void
 184_F_(start_flow)(struct acl_flow_avx512 *flow, uint32_t num, uint32_t msk,
 185        _T_simd pdata[2], _T_simd *idx, _T_simd *di)
 186{
 187        uint32_t n, m[2], nm[2];
 188        _T_simd ni, nd[2];
 189
 190        /* split mask into two - one for each pdata[] */
 191        m[0] = msk & _SIMD_PTR_MSK_;
 192        m[1] = msk >> _SIMD_PTR_NUM_;
 193
 194        /* calculate masks for new flows */
 195        n = __builtin_popcount(m[0]);
 196        nm[0] = (1 << n) - 1;
 197        nm[1] = (1 << (num - n)) - 1;
 198
 199        /* load input data pointers for new flows */
 200        nd[0] = _M_I_(maskz_loadu_epi64)(nm[0],
 201                        flow->idata + flow->num_packets);
 202        nd[1] = _M_I_(maskz_loadu_epi64)(nm[1],
 203                        flow->idata + flow->num_packets + n);
 204
 205        /* calculate match indexes of new flows */
 206        ni = _M_I_(set1_epi32)(flow->num_packets);
 207        ni = _M_I_(add_epi32)(ni, _SV_(idx_add));
 208
 209        /* merge new and existing flows data */
 210        pdata[0] = _M_I_(mask_expand_epi64)(pdata[0], m[0], nd[0]);
 211        pdata[1] = _M_I_(mask_expand_epi64)(pdata[1], m[1], nd[1]);
 212
 213        /* update match and data indexes */
 214        *idx = _M_I_(mask_expand_epi32)(*idx, msk, ni);
 215        *di = _M_I_(maskz_mov_epi32)(msk ^ _SIMD_MASK_MAX_, *di);
 216
 217        flow->num_packets += num;
 218}
 219
 220/*
 221 * Process found matches for up to _N_ flows.
 222 * fmsk - mask of active flows
 223 * rmsk - mask of found matches
 224 * pdata - pointers to flow input data
 225 * di - data indexes for these flows
 226 * idx - match indexed for given flows
 227 * tr_lo contains low 32 bits for up to _N_ transitions.
 228 * tr_hi contains high 32 bits for up to _N_ transitions.
 229 */
 230static inline uint32_t
 231_F_(match_process)(struct acl_flow_avx512 *flow, uint32_t *fmsk,
 232        uint32_t *rmsk, _T_simd pdata[2], _T_simd *di, _T_simd *idx,
 233        _T_simd *tr_lo, _T_simd *tr_hi)
 234{
 235        uint32_t n;
 236        _T_simd res;
 237
 238        if (rmsk[0] == 0)
 239                return 0;
 240
 241        /* extract match indexes */
 242        res = _M_SI_(and)(tr_lo[0], _SV_(index_mask));
 243
 244        /* mask  matched transitions to nop */
 245        tr_lo[0] = _M_I_(mask_mov_epi32)(tr_lo[0], rmsk[0], _SV_(trlo_idle));
 246        tr_hi[0] = _M_I_(mask_mov_epi32)(tr_hi[0], rmsk[0], _SV_(trhi_idle));
 247
 248        /* save found match indexes */
 249        _M_I_(mask_i32scatter_epi32)((void *)flow->matches, rmsk[0], idx[0],
 250                        res, sizeof(flow->matches[0]));
 251
 252        /* update masks and start new flows for matches */
 253        n = update_flow_mask(flow, fmsk, rmsk);
 254        _F_(start_flow)(flow, n, rmsk[0], pdata, idx, di);
 255
 256        return n;
 257}
 258
 259/*
 260 * Test for matches ut to (2 * _N_) flows at once,
 261 * if matches exist - process them and start new flows.
 262 */
 263static inline void
 264_F_(match_check_process)(struct acl_flow_avx512 *flow, uint32_t fm[2],
 265        _T_simd pdata[4], _T_simd di[2], _T_simd idx[2], _T_simd inp[2],
 266        _T_simd tr_lo[2], _T_simd tr_hi[2])
 267{
 268        uint32_t n[2];
 269        uint32_t rm[2];
 270
 271        /* check for matches */
 272        rm[0] = _M_I_(test_epi32_mask)(tr_lo[0], _SV_(match_mask));
 273        rm[1] = _M_I_(test_epi32_mask)(tr_lo[1], _SV_(match_mask));
 274
 275        /* till unprocessed matches exist */
 276        while ((rm[0] | rm[1]) != 0) {
 277
 278                /* process matches and start new flows */
 279                n[0] = _F_(match_process)(flow, &fm[0], &rm[0], &pdata[0],
 280                        &di[0], &idx[0], &tr_lo[0], &tr_hi[0]);
 281                n[1] = _F_(match_process)(flow, &fm[1], &rm[1], &pdata[2],
 282                        &di[1], &idx[1], &tr_lo[1], &tr_hi[1]);
 283
 284                /* execute first transition for new flows, if any */
 285
 286                if (n[0] != 0) {
 287                        inp[0] = _F_(get_next_bytes)(flow, &pdata[0],
 288                                        rm[0], &di[0], flow->first_load_sz);
 289                        _F_(first_trans)(flow, inp[0], rm[0], &tr_lo[0],
 290                                        &tr_hi[0]);
 291                        rm[0] = _M_I_(test_epi32_mask)(tr_lo[0],
 292                                        _SV_(match_mask));
 293                }
 294
 295                if (n[1] != 0) {
 296                        inp[1] = _F_(get_next_bytes)(flow, &pdata[2],
 297                                        rm[1], &di[1], flow->first_load_sz);
 298                        _F_(first_trans)(flow, inp[1], rm[1], &tr_lo[1],
 299                                        &tr_hi[1]);
 300                        rm[1] = _M_I_(test_epi32_mask)(tr_lo[1],
 301                                        _SV_(match_mask));
 302                }
 303        }
 304}
 305
 306static inline void
 307_F_(reset_flow_vars)(_T_simd di[2], _T_simd idx[2], _T_simd pdata[4],
 308        _T_simd tr_lo[2], _T_simd tr_hi[2])
 309{
 310        di[0] = _M_SI_(setzero)();
 311        di[1] = _M_SI_(setzero)();
 312
 313        idx[0] = _M_SI_(setzero)();
 314        idx[1] = _M_SI_(setzero)();
 315
 316        pdata[0] = _M_SI_(setzero)();
 317        pdata[1] = _M_SI_(setzero)();
 318        pdata[2] = _M_SI_(setzero)();
 319        pdata[3] = _M_SI_(setzero)();
 320
 321        tr_lo[0] = _M_SI_(setzero)();
 322        tr_lo[1] = _M_SI_(setzero)();
 323
 324        tr_hi[0] = _M_SI_(setzero)();
 325        tr_hi[1] = _M_SI_(setzero)();
 326}
 327
 328/*
 329 * Perform search for up to (2 * _N_) flows in parallel.
 330 * Use two sets of metadata, each serves _N_ flows max.
 331 */
 332static inline void
 333_F_(search_trie)(struct acl_flow_avx512 *flow)
 334{
 335        uint32_t fm[2];
 336        _T_simd di[2], idx[2], in[2], pdata[4], tr_lo[2], tr_hi[2];
 337
 338        _F_(reset_flow_vars)(di, idx, pdata, tr_lo, tr_hi);
 339
 340        /* first 1B load */
 341        _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
 342                        &pdata[0], &idx[0], &di[0]);
 343        _F_(start_flow)(flow, _SIMD_MASK_BIT_, _SIMD_MASK_MAX_,
 344                        &pdata[2], &idx[1], &di[1]);
 345
 346        in[0] = _F_(get_next_bytes)(flow, &pdata[0], _SIMD_MASK_MAX_, &di[0],
 347                        flow->first_load_sz);
 348        in[1] = _F_(get_next_bytes)(flow, &pdata[2], _SIMD_MASK_MAX_, &di[1],
 349                        flow->first_load_sz);
 350
 351        _F_(first_trans)(flow, in[0], _SIMD_MASK_MAX_, &tr_lo[0], &tr_hi[0]);
 352        _F_(first_trans)(flow, in[1], _SIMD_MASK_MAX_, &tr_lo[1], &tr_hi[1]);
 353
 354        fm[0] = _SIMD_MASK_MAX_;
 355        fm[1] = _SIMD_MASK_MAX_;
 356
 357        /* match check */
 358        _F_(match_check_process)(flow, fm, pdata, di, idx, in, tr_lo, tr_hi);
 359
 360        while ((fm[0] | fm[1]) != 0) {
 361
 362                /* load next 4B */
 363
 364                in[0] = _F_(get_next_bytes)(flow, &pdata[0], fm[0],
 365                                &di[0], sizeof(uint32_t));
 366                in[1] = _F_(get_next_bytes)(flow, &pdata[2], fm[1],
 367                                &di[1], sizeof(uint32_t));
 368
 369                /* main 4B loop */
 370
 371                in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 372                in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 373
 374                in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 375                in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 376
 377                in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 378                in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 379
 380                in[0] = _F_(trans)(in[0], flow->trans, &tr_lo[0], &tr_hi[0]);
 381                in[1] = _F_(trans)(in[1], flow->trans, &tr_lo[1], &tr_hi[1]);
 382
 383                /* check for matches */
 384                _F_(match_check_process)(flow, fm, pdata, di, idx, in,
 385                        tr_lo, tr_hi);
 386        }
 387}
 388
 389/*
 390 * resolve match index to actual result/priority offset.
 391 */
 392static inline _T_simd
 393_F_(resolve_match_idx)(_T_simd mi)
 394{
 395        RTE_BUILD_BUG_ON(sizeof(struct rte_acl_match_results) !=
 396                1 << (ACL_MATCH_LOG + 2));
 397        return _M_I_(slli_epi32)(mi, ACL_MATCH_LOG);
 398}
 399
 400/*
 401 * Resolve multiple matches for the same flow based on priority.
 402 */
 403static inline _T_simd
 404_F_(resolve_pri)(const int32_t res[], const int32_t pri[],
 405        const uint32_t match[], _T_mask msk, uint32_t nb_trie,
 406        uint32_t nb_skip)
 407{
 408        uint32_t i;
 409        const uint32_t *pm;
 410        _T_mask m;
 411        _T_simd cp, cr, np, nr, mch;
 412
 413        const _T_simd zero = _M_I_(set1_epi32)(0);
 414
 415        /* get match indexes */
 416        mch = _M_I_(maskz_loadu_epi32)(msk, match);
 417        mch = _F_(resolve_match_idx)(mch);
 418
 419        /* read result and priority values for first trie */
 420        cr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res, sizeof(res[0]));
 421        cp = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri, sizeof(pri[0]));
 422
 423        /*
 424         * read result and priority values for next tries and select one
 425         * with highest priority.
 426         */
 427        for (i = 1, pm = match + nb_skip; i != nb_trie;
 428                        i++, pm += nb_skip) {
 429
 430                mch = _M_I_(maskz_loadu_epi32)(msk, pm);
 431                mch = _F_(resolve_match_idx)(mch);
 432
 433                nr = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, res,
 434                                sizeof(res[0]));
 435                np = _M_MGI_(mask_i32gather_epi32)(zero, msk, mch, pri,
 436                                sizeof(pri[0]));
 437
 438                m = _M_I_(cmpgt_epi32_mask)(cp, np);
 439                cr = _M_I_(mask_mov_epi32)(nr, m, cr);
 440                cp = _M_I_(mask_mov_epi32)(np, m, cp);
 441        }
 442
 443        return cr;
 444}
 445
 446/*
 447 * Resolve num (<= _N_) matches for single category
 448 */
 449static inline void
 450_F_(resolve_sc)(uint32_t result[], const int32_t res[],
 451        const int32_t pri[], const uint32_t match[], uint32_t nb_pkt,
 452        uint32_t nb_trie, uint32_t nb_skip)
 453{
 454        _T_mask msk;
 455        _T_simd cr;
 456
 457        msk = (1 << nb_pkt) - 1;
 458        cr = _F_(resolve_pri)(res, pri, match, msk, nb_trie, nb_skip);
 459        _M_I_(mask_storeu_epi32)(result, msk, cr);
 460}
 461
 462/*
 463 * Resolve matches for single category
 464 */
 465static inline void
 466_F_(resolve_single_cat)(uint32_t result[],
 467        const struct rte_acl_match_results pr[], const uint32_t match[],
 468        uint32_t nb_pkt, uint32_t nb_trie)
 469{
 470        uint32_t j, k, n;
 471        const int32_t *res, *pri;
 472        _T_simd cr[2];
 473
 474        res = (const int32_t *)pr->results;
 475        pri = pr->priority;
 476
 477        for (k = 0; k != (nb_pkt & ~_SIMD_FLOW_MSK_); k += _SIMD_FLOW_NUM_) {
 478
 479                j = k + _SIMD_MASK_BIT_;
 480
 481                cr[0] = _F_(resolve_pri)(res, pri, match + k, _SIMD_MASK_MAX_,
 482                                nb_trie, nb_pkt);
 483                cr[1] = _F_(resolve_pri)(res, pri, match + j, _SIMD_MASK_MAX_,
 484                                nb_trie, nb_pkt);
 485
 486                _M_SI_(storeu)((void *)(result + k), cr[0]);
 487                _M_SI_(storeu)((void *)(result + j), cr[1]);
 488        }
 489
 490        n = nb_pkt - k;
 491        if (n != 0) {
 492                if (n > _SIMD_MASK_BIT_) {
 493                        _F_(resolve_sc)(result + k, res, pri, match + k,
 494                                _SIMD_MASK_BIT_, nb_trie, nb_pkt);
 495                        k += _SIMD_MASK_BIT_;
 496                        n -= _SIMD_MASK_BIT_;
 497                }
 498                _F_(resolve_sc)(result + k, res, pri, match + k, n,
 499                                nb_trie, nb_pkt);
 500        }
 501}
 502