dpdk/lib/acl/acl_run_avx2.h
<<
>>
Prefs
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2010-2014 Intel Corporation
   3 */
   4
   5#include "acl_run_sse.h"
   6
   7static const rte_ymm_t ymm_match_mask = {
   8        .u32 = {
   9                RTE_ACL_NODE_MATCH,
  10                RTE_ACL_NODE_MATCH,
  11                RTE_ACL_NODE_MATCH,
  12                RTE_ACL_NODE_MATCH,
  13                RTE_ACL_NODE_MATCH,
  14                RTE_ACL_NODE_MATCH,
  15                RTE_ACL_NODE_MATCH,
  16                RTE_ACL_NODE_MATCH,
  17        },
  18};
  19
  20static const rte_ymm_t ymm_index_mask = {
  21        .u32 = {
  22                RTE_ACL_NODE_INDEX,
  23                RTE_ACL_NODE_INDEX,
  24                RTE_ACL_NODE_INDEX,
  25                RTE_ACL_NODE_INDEX,
  26                RTE_ACL_NODE_INDEX,
  27                RTE_ACL_NODE_INDEX,
  28                RTE_ACL_NODE_INDEX,
  29                RTE_ACL_NODE_INDEX,
  30        },
  31};
  32
  33static const rte_ymm_t ymm_shuffle_input = {
  34        .u32 = {
  35                0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  36                0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c,
  37        },
  38};
  39
  40static const rte_ymm_t ymm_ones_16 = {
  41        .u16 = {
  42                1, 1, 1, 1, 1, 1, 1, 1,
  43                1, 1, 1, 1, 1, 1, 1, 1,
  44        },
  45};
  46
  47static const rte_ymm_t ymm_range_base = {
  48        .u32 = {
  49                0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
  50                0xffffff00, 0xffffff04, 0xffffff08, 0xffffff0c,
  51        },
  52};
  53
  54/*
  55 * Process 8 transitions in parallel.
  56 * tr_lo contains low 32 bits for 8 transition.
  57 * tr_hi contains high 32 bits for 8 transition.
  58 * next_input contains up to 4 input bytes for 8 flows.
  59 */
  60static __rte_always_inline ymm_t
  61transition8(ymm_t next_input, const uint64_t *trans, ymm_t *tr_lo, ymm_t *tr_hi)
  62{
  63        const int32_t *tr;
  64        ymm_t addr;
  65
  66        tr = (const int32_t *)(uintptr_t)trans;
  67
  68        /* Calculate the address (array index) for all 8 transitions. */
  69        ACL_TR_CALC_ADDR(mm256, 256, addr, ymm_index_mask.y, next_input,
  70                ymm_shuffle_input.y, ymm_ones_16.y, ymm_range_base.y,
  71                *tr_lo, *tr_hi);
  72
  73        /* load lower 32 bits of 8 transactions at once. */
  74        *tr_lo = _mm256_i32gather_epi32(tr, addr, sizeof(trans[0]));
  75
  76        next_input = _mm256_srli_epi32(next_input, CHAR_BIT);
  77
  78        /* load high 32 bits of 8 transactions at once. */
  79        *tr_hi = _mm256_i32gather_epi32(tr + 1, addr, sizeof(trans[0]));
  80
  81        return next_input;
  82}
  83
  84/*
  85 * Process matches for  8 flows.
  86 * tr_lo contains low 32 bits for 8 transition.
  87 * tr_hi contains high 32 bits for 8 transition.
  88 */
  89static inline void
  90acl_process_matches_avx2x8(const struct rte_acl_ctx *ctx,
  91        struct parms *parms, struct acl_flow_data *flows, uint32_t slot,
  92        ymm_t matches, ymm_t *tr_lo, ymm_t *tr_hi)
  93{
  94        ymm_t t0, t1;
  95        ymm_t lo, hi;
  96        xmm_t l0, l1;
  97        uint32_t i;
  98        uint64_t tr[MAX_SEARCHES_SSE8];
  99
 100        l1 = _mm256_extracti128_si256(*tr_lo, 1);
 101        l0 = _mm256_castsi256_si128(*tr_lo);
 102
 103        for (i = 0; i != RTE_DIM(tr) / 2; i++) {
 104
 105                /*
 106                 * Extract low 32bits of each transition.
 107                 * That's enough to process the match.
 108                 */
 109                tr[i] = (uint32_t)_mm_cvtsi128_si32(l0);
 110                tr[i + 4] = (uint32_t)_mm_cvtsi128_si32(l1);
 111
 112                l0 = _mm_srli_si128(l0, sizeof(uint32_t));
 113                l1 = _mm_srli_si128(l1, sizeof(uint32_t));
 114
 115                tr[i] = acl_match_check(tr[i], slot + i,
 116                        ctx, parms, flows, resolve_priority_sse);
 117                tr[i + 4] = acl_match_check(tr[i + 4], slot + i + 4,
 118                        ctx, parms, flows, resolve_priority_sse);
 119        }
 120
 121        /* Collect new transitions into 2 YMM registers. */
 122        t0 = _mm256_set_epi64x(tr[5], tr[4], tr[1], tr[0]);
 123        t1 = _mm256_set_epi64x(tr[7], tr[6], tr[3], tr[2]);
 124
 125        /* For each transition: put low 32 into tr_lo and high 32 into tr_hi */
 126        ACL_TR_HILO(mm256, __m256, t0, t1, lo, hi);
 127
 128        /* Keep transitions with NOMATCH intact. */
 129        *tr_lo = _mm256_blendv_epi8(*tr_lo, lo, matches);
 130        *tr_hi = _mm256_blendv_epi8(*tr_hi, hi, matches);
 131}
 132
 133static inline void
 134acl_match_check_avx2x8(const struct rte_acl_ctx *ctx, struct parms *parms,
 135        struct acl_flow_data *flows, uint32_t slot,
 136        ymm_t *tr_lo, ymm_t *tr_hi, ymm_t match_mask)
 137{
 138        uint32_t msk;
 139        ymm_t matches, temp;
 140
 141        /* test for match node */
 142        temp = _mm256_and_si256(match_mask, *tr_lo);
 143        matches = _mm256_cmpeq_epi32(temp, match_mask);
 144        msk = _mm256_movemask_epi8(matches);
 145
 146        while (msk != 0) {
 147
 148                acl_process_matches_avx2x8(ctx, parms, flows, slot,
 149                        matches, tr_lo, tr_hi);
 150                temp = _mm256_and_si256(match_mask, *tr_lo);
 151                matches = _mm256_cmpeq_epi32(temp, match_mask);
 152                msk = _mm256_movemask_epi8(matches);
 153        }
 154}
 155
 156/*
 157 * Execute trie traversal for up to 16 flows in parallel.
 158 */
 159static inline int
 160search_avx2x16(const struct rte_acl_ctx *ctx, const uint8_t **data,
 161        uint32_t *results, uint32_t total_packets, uint32_t categories)
 162{
 163        uint32_t n;
 164        struct acl_flow_data flows;
 165        uint64_t index_array[MAX_SEARCHES_AVX16];
 166        struct completion cmplt[MAX_SEARCHES_AVX16];
 167        struct parms parms[MAX_SEARCHES_AVX16];
 168        ymm_t input[2], tr_lo[2], tr_hi[2];
 169        ymm_t t0, t1;
 170
 171        acl_set_flow(&flows, cmplt, RTE_DIM(cmplt), data, results,
 172                total_packets, categories, ctx->trans_table);
 173
 174        for (n = 0; n < RTE_DIM(cmplt); n++) {
 175                cmplt[n].count = 0;
 176                index_array[n] = acl_start_next_trie(&flows, parms, n, ctx);
 177        }
 178
 179        t0 = _mm256_set_epi64x(index_array[5], index_array[4],
 180                index_array[1], index_array[0]);
 181        t1 = _mm256_set_epi64x(index_array[7], index_array[6],
 182                index_array[3], index_array[2]);
 183
 184        ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[0], tr_hi[0]);
 185
 186        t0 = _mm256_set_epi64x(index_array[13], index_array[12],
 187                index_array[9], index_array[8]);
 188        t1 = _mm256_set_epi64x(index_array[15], index_array[14],
 189                index_array[11], index_array[10]);
 190
 191        ACL_TR_HILO(mm256, __m256, t0, t1, tr_lo[1], tr_hi[1]);
 192
 193         /* Check for any matches. */
 194        acl_match_check_avx2x8(ctx, parms, &flows, 0, &tr_lo[0], &tr_hi[0],
 195                ymm_match_mask.y);
 196        acl_match_check_avx2x8(ctx, parms, &flows, 8, &tr_lo[1], &tr_hi[1],
 197                ymm_match_mask.y);
 198
 199        while (flows.started > 0) {
 200
 201                uint32_t in[MAX_SEARCHES_SSE8];
 202
 203                /* Gather 4 bytes of input data for first 8 flows. */
 204                in[0] = GET_NEXT_4BYTES(parms, 0);
 205                in[4] = GET_NEXT_4BYTES(parms, 4);
 206                in[1] = GET_NEXT_4BYTES(parms, 1);
 207                in[5] = GET_NEXT_4BYTES(parms, 5);
 208                in[2] = GET_NEXT_4BYTES(parms, 2);
 209                in[6] = GET_NEXT_4BYTES(parms, 6);
 210                in[3] = GET_NEXT_4BYTES(parms, 3);
 211                in[7] = GET_NEXT_4BYTES(parms, 7);
 212                input[0] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
 213                        in[3], in[2], in[1], in[0]);
 214
 215                /* Gather 4 bytes of input data for last 8 flows. */
 216                in[0] = GET_NEXT_4BYTES(parms, 8);
 217                in[4] = GET_NEXT_4BYTES(parms, 12);
 218                in[1] = GET_NEXT_4BYTES(parms, 9);
 219                in[5] = GET_NEXT_4BYTES(parms, 13);
 220                in[2] = GET_NEXT_4BYTES(parms, 10);
 221                in[6] = GET_NEXT_4BYTES(parms, 14);
 222                in[3] = GET_NEXT_4BYTES(parms, 11);
 223                in[7] = GET_NEXT_4BYTES(parms, 15);
 224                input[1] = _mm256_set_epi32(in[7], in[6], in[5], in[4],
 225                        in[3], in[2], in[1], in[0]);
 226
 227                input[0] = transition8(input[0], flows.trans,
 228                        &tr_lo[0], &tr_hi[0]);
 229                input[1] = transition8(input[1], flows.trans,
 230                        &tr_lo[1], &tr_hi[1]);
 231
 232                input[0] = transition8(input[0], flows.trans,
 233                        &tr_lo[0], &tr_hi[0]);
 234                input[1] = transition8(input[1], flows.trans,
 235                        &tr_lo[1], &tr_hi[1]);
 236
 237                input[0] = transition8(input[0], flows.trans,
 238                        &tr_lo[0], &tr_hi[0]);
 239                input[1] = transition8(input[1], flows.trans,
 240                        &tr_lo[1], &tr_hi[1]);
 241
 242                input[0] = transition8(input[0], flows.trans,
 243                        &tr_lo[0], &tr_hi[0]);
 244                input[1] = transition8(input[1], flows.trans,
 245                        &tr_lo[1], &tr_hi[1]);
 246
 247                 /* Check for any matches. */
 248                acl_match_check_avx2x8(ctx, parms, &flows, 0,
 249                        &tr_lo[0], &tr_hi[0], ymm_match_mask.y);
 250                acl_match_check_avx2x8(ctx, parms, &flows, 8,
 251                        &tr_lo[1], &tr_hi[1], ymm_match_mask.y);
 252        }
 253
 254        return 0;
 255}
 256