linux/net/netfilter/nft_set_pipapo_avx2.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0-only
   2
   3/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
   4 *
   5 * Copyright (c) 2019-2020 Red Hat GmbH
   6 *
   7 * Author: Stefano Brivio <sbrivio@redhat.com>
   8 */
   9
  10#include <linux/kernel.h>
  11#include <linux/init.h>
  12#include <linux/module.h>
  13#include <linux/netlink.h>
  14#include <linux/netfilter.h>
  15#include <linux/netfilter/nf_tables.h>
  16#include <net/netfilter/nf_tables_core.h>
  17#include <uapi/linux/netfilter/nf_tables.h>
  18#include <linux/bitmap.h>
  19#include <linux/bitops.h>
  20
  21#include <linux/compiler.h>
  22#include <asm/fpu/api.h>
  23
  24#include "nft_set_pipapo_avx2.h"
  25#include "nft_set_pipapo.h"
  26
  27#define NFT_PIPAPO_LONGS_PER_M256       (XSAVE_YMM_SIZE / BITS_PER_LONG)
    /* Number of longs in one 256-bit chunk: the lookup routines below step
     * through lookup-table buckets and result bitmaps in units of this many
     * longs. NOTE(review): this only works if XSAVE_YMM_SIZE equals 256 when
     * read as a bit count -- confirm against its definition.
     */
  28
  29/* Load from memory into YMM register with non-temporal hint ("stream load"),
  30 * that is, don't fetch lines from memory into the cache. This avoids pushing
  31 * precious packet data out of the cache hierarchy, and is appropriate when:
  32 *
  33 * - loading buckets from lookup tables, as they are not going to be used
  34 *   again before packets are entirely classified
  35 *
  36 * - loading the result bitmap from the previous field, as it's never used
  37 *   again
     *
     * @reg is pasted into the instruction text with #reg, so it has to be a
     * literal register number, not a variable. @loc is handed to the compiler as
     * an "m" (memory) input operand. The YMM register written is not declared
     * as an output or clobber.
     *
     * NOTE(review): vmovntdqa with a YMM destination is understood to require a
     * 32-byte aligned source -- confirm that every @loc passed in is aligned.
  38 */
  39#define NFT_PIPAPO_AVX2_LOAD(reg, loc)                                  \
  40        asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
  41
  42/* Stream a single lookup table bucket into YMM register given lookup table,
  43 * group index, value of packet bits, bucket size.
     *
     * The bucket is found at element (@group * buckets-per-group + @v) * @bsize
     * of @lt, where buckets-per-group is NFT_PIPAPO_BUCKETS(4) for the LOAD4
     * variant and NFT_PIPAPO_BUCKETS(8) for LOAD8. Only the 256 bits starting at
     * that element are loaded; callers in this file advance @lt itself to reach
     * later 256-bit chunks of a wider bucket.
  44 */
  45#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize)          \
  46        NFT_PIPAPO_AVX2_LOAD(reg,                                       \
  47                             lt[((group) * NFT_PIPAPO_BUCKETS(4) +      \
  48                                 (v)) * (bsize)])
  49#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize)          \
  50        NFT_PIPAPO_AVX2_LOAD(reg,                                       \
  51                             lt[((group) * NFT_PIPAPO_BUCKETS(8) +      \
  52                                 (v)) * (bsize)])
  53
  54/* Bitwise AND: the staple operation of this algorithm
     *
     * Computes ymm@dst = ymm@a & ymm@b. All three arguments are literal register
     * numbers pasted into the instruction text. The statement has no operand
     * list, which is why register names carry a single '%' here, and the
     * compiler is told nothing about which YMM registers are read or written.
     */
  55#define NFT_PIPAPO_AVX2_AND(dst, a, b)                                  \
  56        asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
  57
  58/* Jump to label if @reg is zero
     *
     * vptest of a register against itself leaves ZF set only when all 256 bits
     * are clear; the following je then branches to @label. @label is a C label
     * in the calling function: it is listed as the asm goto target, so the
     * compiler knows control may leave the statement that way.
     */
  59#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label)                        \
  60        asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";"        \
  61                          "je %l[" #label "]" : : : : label)
  62
  63/* Store 256 bits from YMM register into memory. Contrary to bucket load
  64 * operation, we don't bypass the cache here, as stored matching results
  65 * are always used shortly after.
     *
     * @loc is passed as an "=m" (memory) output operand; @reg is a literal
     * register number pasted into the instruction text.
     *
     * NOTE(review): vmovdqa is the aligned move -- confirm that every @loc
     * stored to is 32-byte aligned.
  66 */
  67#define NFT_PIPAPO_AVX2_STORE(loc, reg)                                 \
  68        asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
  69
  70/* Zero out a complete YMM register, @reg
     *
     * Done by XORing the register with itself. @reg is a literal register
     * number; the statement declares no operands or clobbers.
     */
  71#define NFT_PIPAPO_AVX2_ZERO(reg)                                       \
  72        asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
  73
  74/* Current working bitmap index, toggled between field matches.
     * Declared per-CPU, so each CPU keeps its own copy of the index.
     */
  75static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
  76
  77/**
  78 * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
  79 *
  80 * This zeroes out ymm15, which is later used whenever we need to clear a
  81 * memory location, by storing its content into memory.
     *
     * The lookup routines store ymm15 on their "nomatch" path to wipe the
     * current 256-bit chunk of the result map. The four-bit group variants use
     * only ymm0-ymm14 as working registers, so ymm15 stays zero across them.
  82 */
  83static void nft_pipapo_avx2_prepare(void)
  84{
            /* ymm15 ^= ymm15: all 256 bits cleared */
  85        NFT_PIPAPO_AVX2_ZERO(15);
  86}
  87
  88/**
  89 * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
  90 * @data:       Base memory area
  91 * @start:      First bit to set
  92 * @len:        Count of bits to fill
  93 *
  94 * This is nothing else than a version of bitmap_set(), as used e.g. by
  95 * pipapo_refill(), tailored for the microarchitectures using it and better
  96 * suited for the specific usage: it's very likely that we'll set a small number
  97 * of bits, not crossing a word boundary, and correct branch prediction is
  98 * critical here.
  99 *
 100 * This function doesn't actually use any AVX2 instruction.
 101 */
 102static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
 103{
            unsigned long *word = data + start / BITS_PER_LONG;
            int shift = start % BITS_PER_LONG;

            /* Expected common case: exactly one bit to set */
            if (likely(len == 1)) {
                    *word |= BIT(shift);
                    return;
            }

            if (likely(len < BITS_PER_LONG || shift)) {
                    /* The whole run sits inside the word it starts in */
                    if (likely(shift + len <= BITS_PER_LONG)) {
                            *word |= GENMASK(shift + len - 1, shift);
                            return;
                    }

                    /* Run spills over: top up the first word and move on */
                    *word |= ~0UL << shift;
                    word++;
                    len -= BITS_PER_LONG - shift;

                    /* What is left ends within the following word */
                    if (len <= BITS_PER_LONG) {
                            *word |= ~0UL >> (BITS_PER_LONG - len);
                            return;
                    }
            }

            /* Bulk part at byte granularity, then leftover bits of last word */
            memset(word, 0xff, len / BITS_PER_BYTE);
            word += len / BITS_PER_LONG;

            len %= BITS_PER_LONG;
            if (len)
                    *word |= ~0UL >> (BITS_PER_LONG - len);
 137}
 138
 139/**
     * nft_pipapo_avx2_refill() - Expand set bits of a bitmap chunk via mapping table
     * @offset:     Position of @map within the complete bitmap, in longs
     * @map:        Four consecutive longs to be scanned for set bits
     * @dst:        Bitmap in which the mapped bit ranges are set
     * @mt:         Mapping table, indexed by absolute bit position
     * @last:       Last field: return at the first set bit, leaving @dst alone
     *
     * Counterpart of pipapo_refill() for the AVX2 lookup routines: exactly four
     * words, located @offset longs into the bitmap, are scanned per call. For
     * each set bit, the range given by the matching @mt entry ('to', 'n') is set
     * in @dst, and the bit is then cleared from @map.
     *
     * No AVX2 instruction is used here.
     *
     * Return: absolute index of the first set bit if @last; otherwise the 'to'
     * value of the first mapping entry applied. -1 if no bit is set in @map.
     */
 156static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
 157                                  unsigned long *dst,
 158                                  union nft_pipapo_map_bucket *mt, bool last)
 159{
            int first_to = -1;

    /* Drain word @w of @map, lowest set bit first */
    #define NFT_PIPAPO_AVX2_REFILL_WORD(w)                                  \
            do {                                                            \
                    unsigned long *pending = &map[(w)];                     \
                    int base = (offset + (w)) * BITS_PER_LONG;              \
                                                                            \
                    while (*pending) {                                      \
                            int low = __builtin_ctzl(*pending);             \
                            int rule = base + low;                          \
                                                                            \
                            if (last)                                       \
                                    return rule;                            \
                                                                            \
                            nft_pipapo_avx2_fill(dst, mt[rule].to,          \
                                                 mt[rule].n);               \
                                                                            \
                            if (first_to == -1)                             \
                                    first_to = mt[rule].to;                 \
                                                                            \
                            *pending &= ~BIT(low);                          \
                    }                                                       \
            } while (0)

            NFT_PIPAPO_AVX2_REFILL_WORD(0);
            NFT_PIPAPO_AVX2_REFILL_WORD(1);
            NFT_PIPAPO_AVX2_REFILL_WORD(2);
            NFT_PIPAPO_AVX2_REFILL_WORD(3);
    #undef NFT_PIPAPO_AVX2_REFILL_WORD

            return first_to;
 187}
 188
 189/**
 190 * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
 191 * @map:        Previous match result, used as initial bitmap
 192 * @fill:       Destination bitmap to be filled with current match result
 193 * @f:          Field, containing lookup and mapping tables
 194 * @offset:     Ignore buckets before the given index, no bits are filled there
 195 * @pkt:        Packet data, pointer to input nftables register
 196 * @first:      If this is the first field, don't source previous result
 197 * @last:       Last field: stop at the first match and return bit index
 198 *
 199 * Load buckets from lookup table corresponding to the values of each 4-bit
 200 * group of packet bytes, and perform a bitwise intersection between them. If
 201 * this is the first field in the set, simply AND the buckets together
 202 * (equivalent to using an all-ones starting bitmap), use the provided starting
 203 * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
 204 * working bitmap, @fill.
 205 *
 206 * This is used for 8-bit fields (e.g. protocol numbers).
 207 *
 208 * Out-of-order (and superscalar) execution is vital here, so it's critical to
 209 * avoid false data dependencies. CPU and compiler could (mostly) take care of
 210 * this on their own, but the operation ordering is explicitly given here with
 211 * a likely execution order in mind, to highlight possible stalls. That's why
 212 * a number of logically distinct operations (i.e. loading buckets, intersecting
 213 * buckets) are interleaved.
 214 *
 215 * Return: -1 on no match, rule index of match if @last, otherwise first long
 216 * word index to be checked next (i.e. first filled word).
 217 */
 218static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
 219                                       struct nft_pipapo_field *f, int offset,
 220                                       const u8 *pkt, bool first, bool last)
 221{
 222        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
            /* High nibble picks the bucket in group 0, low nibble in group 1 */
 223        u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
 224        unsigned long *lt = f->lt, bsize = f->bsize;
 225
            /* Start at the @offset-th 256-bit chunk of every bucket */
 226        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 227        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
                    /* Position of the current 256-bit chunk in @map, in longs */
 228                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 229
 230                if (first) {
                            /* No previous result: ymm4 = bucket0 & bucket1 */
 231                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
 232                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
 233                        NFT_PIPAPO_AVX2_AND(4, 0, 1);
 234                } else {
                            /* ymm2 = previous result chunk. If it is all zeroes,
                             * skip the chunk and leave @map as it is; otherwise
                             * ymm4 = previous & bucket0 & bucket1.
                             */
 235                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
 236                        NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
 237                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
 238                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
 239                        NFT_PIPAPO_AVX2_AND(3, 0, 1);
 240                        NFT_PIPAPO_AVX2_AND(4, 2, 3);
 241                }
 242
                    /* Empty intersection: go clear this chunk of @map */
 243                NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
 244                NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
 245
                    /* Expand the bits just stored into @fill; with @last the
                     * index of the first set bit comes straight back instead.
                     */
 246                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 247                if (last)
 248                        return b;
 249
                    /* Only the first chunk that produced bits sets the result.
                     * NOTE(review): the division presumably converts a bit
                     * position in @fill to a 256-bit chunk index, which needs
                     * XSAVE_YMM_SIZE to be 256 -- confirm.
                     */
 250                if (unlikely(ret == -1))
 251                        ret = b / XSAVE_YMM_SIZE;
 252
 253                continue;
 254nomatch:
                    /* ymm15 is zeroed by nft_pipapo_avx2_prepare() */
 255                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 256nothing:
 257                ;
 258        }
 259
 260        return ret;
 261}
 262
 263/**
 264 * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
 265 * @map:        Previous match result, used as initial bitmap
 266 * @fill:       Destination bitmap to be filled with current match result
 267 * @f:          Field, containing lookup and mapping tables
 268 * @offset:     Ignore buckets before the given index, no bits are filled there
 269 * @pkt:        Packet data, pointer to input nftables register
 270 * @first:      If this is the first field, don't source previous result
 271 * @last:       Last field: stop at the first match and return bit index
 272 *
 273 * See nft_pipapo_avx2_lookup_4b_2().
 274 *
 275 * This is used for 16-bit fields (e.g. ports).
 276 *
 277 * Return: -1 on no match, rule index of match if @last, otherwise first long
 278 * word index to be checked next (i.e. first filled word).
 279 */
 280static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
 281                                       struct nft_pipapo_field *f, int offset,
 282                                       const u8 *pkt, bool first, bool last)
 283{
 284        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
            /* One nibble per group, high nibble of each byte first */
 285        u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
 286        unsigned long *lt = f->lt, bsize = f->bsize;
 287
            /* Start at the @offset-th 256-bit chunk of every bucket */
 288        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 289        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
                    /* Position of the current 256-bit chunk in @map, in longs */
 290                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 291
 292                if (first) {
                            /* No previous result: ymm7 = AND of the four buckets */
 293                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
 294                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
 295                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
 296                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
 297                        NFT_PIPAPO_AVX2_AND(4, 0, 1);
 298                        NFT_PIPAPO_AVX2_AND(5, 2, 3);
 299                        NFT_PIPAPO_AVX2_AND(7, 4, 5);
 300                } else {
                            /* ymm1 = previous result chunk, ymm0/2/3/4 = buckets
                             * of groups 0..3. ymm5 = bucket0 & previous is issued
                             * before ymm1 is tested; if ymm1 turns out to be zero
                             * the chunk is skipped and ymm5 is never used.
                             */
 301                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
 302
 303                        NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
 304
 305                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
 306                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
 307                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
 308                        NFT_PIPAPO_AVX2_AND(5, 0, 1);
 309
 310                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
 311
 312                        NFT_PIPAPO_AVX2_AND(6, 2, 3);
 313                        NFT_PIPAPO_AVX2_AND(7, 4, 5);
 314                        /* Stall */
 315                        NFT_PIPAPO_AVX2_AND(7, 6, 7);
 316                }
 317
                    /* ymm7 holds the intersection on both paths; if it is empty,
                     * go clear this chunk of @map.
                     */
 318                /* Stall */
 319                NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
 320                NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
 321
                    /* Expand the bits just stored into @fill; with @last the
                     * index of the first set bit comes straight back instead.
                     */
 322                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 323                if (last)
 324                        return b;
 325
                    /* Only the first chunk that produced bits sets the result.
                     * NOTE(review): the division presumably converts a bit
                     * position in @fill to a 256-bit chunk index, which needs
                     * XSAVE_YMM_SIZE to be 256 -- confirm.
                     */
 326                if (unlikely(ret == -1))
 327                        ret = b / XSAVE_YMM_SIZE;
 328
 329                continue;
 330nomatch:
                    /* ymm15 is zeroed by nft_pipapo_avx2_prepare() */
 331                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 332nothing:
 333                ;
 334        }
 335
 336        return ret;
 337}
 338
 339/**
 340 * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
 341 * @map:        Previous match result, used as initial bitmap
 342 * @fill:       Destination bitmap to be filled with current match result
 343 * @f:          Field, containing lookup and mapping tables
 344 * @offset:     Ignore buckets before the given index, no bits are filled there
 345 * @pkt:        Packet data, pointer to input nftables register
 346 * @first:      If this is the first field, don't source previous result
 347 * @last:       Last field: stop at the first match and return bit index
 348 *
 349 * See nft_pipapo_avx2_lookup_4b_2().
 350 *
 351 * This is used for 32-bit fields (e.g. IPv4 addresses).
 352 *
 353 * Return: -1 on no match, rule index of match if @last, otherwise first long
 354 * word index to be checked next (i.e. first filled word).
 355 */
 356static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
 357                                       struct nft_pipapo_field *f, int offset,
 358                                       const u8 *pkt, bool first, bool last)
 359{
            /* One nibble per group, high nibble of each byte first */
 360        u8 pg[8] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
 361                      pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
 362                   };
 363        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 364        unsigned long *lt = f->lt, bsize = f->bsize;
 365
            /* Start at the @offset-th 256-bit chunk of every bucket */
 366        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 367        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
                    /* Position of the current 256-bit chunk in @map, in longs */
 368                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 369
 370                if (first) {
                            /* No previous result: pairwise reduction of the
                             * eight buckets, final intersection lands in ymm1.
                             */
 371                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
 372                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 1, pg[1], bsize);
 373                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 2, pg[2], bsize);
 374                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 3, pg[3], bsize);
 375                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 4, pg[4], bsize);
 376                        NFT_PIPAPO_AVX2_AND(5,   0,  1);
 377                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 5, pg[5], bsize);
 378                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 6, pg[6], bsize);
 379                        NFT_PIPAPO_AVX2_AND(8,   2,  3);
 380                        NFT_PIPAPO_AVX2_AND(9,   4,  5);
 381                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
 382                        NFT_PIPAPO_AVX2_AND(11,  6,  7);
 383                        NFT_PIPAPO_AVX2_AND(12,  8,  9);
 384                        NFT_PIPAPO_AVX2_AND(13, 10, 11);
 385
 386                        /* Stall */
 387                        NFT_PIPAPO_AVX2_AND(1,  12, 13);
 388                } else {
                            /* ymm1 = previous result chunk; skip the chunk if
                             * it is all zeroes. Otherwise it is folded in with
                             * bucket 0 (ymm5) and the reduction ends in ymm1.
                             */
 389                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
 390                        NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
 391                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 1, pg[1], bsize);
 392                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 2, pg[2], bsize);
 393                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 3, pg[3], bsize);
 394
 395                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
 396
 397                        NFT_PIPAPO_AVX2_AND(5,   0,  1);
 398                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 4, pg[4], bsize);
 399                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 5, pg[5], bsize);
 400                        NFT_PIPAPO_AVX2_AND(8,   2,  3);
 401                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt, 6, pg[6], bsize);
 402                        NFT_PIPAPO_AVX2_AND(10,  4,  5);
 403                        NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
 404                        NFT_PIPAPO_AVX2_AND(12,  6,  7);
 405                        NFT_PIPAPO_AVX2_AND(13,  8,  9);
 406                        NFT_PIPAPO_AVX2_AND(14, 10, 11);
 407
 408                        /* Stall */
 409                        NFT_PIPAPO_AVX2_AND(1,  12, 13);
 410                        NFT_PIPAPO_AVX2_AND(1,   1, 14);
 411                }
 412
                    /* Empty intersection: go clear this chunk of @map */
 413                NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
 414                NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
 415
                    /* Expand the bits just stored into @fill; with @last the
                     * index of the first set bit comes straight back instead.
                     */
 416                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 417                if (last)
 418                        return b;
 419
                    /* Only the first chunk that produced bits sets the result.
                     * NOTE(review): the division presumably converts a bit
                     * position in @fill to a 256-bit chunk index, which needs
                     * XSAVE_YMM_SIZE to be 256 -- confirm.
                     */
 420                if (unlikely(ret == -1))
 421                        ret = b / XSAVE_YMM_SIZE;
 422
 423                continue;
 424
 425nomatch:
                    /* ymm15 is zeroed by nft_pipapo_avx2_prepare() */
 426                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 427nothing:
 428                ;
 429        }
 430
 431        return ret;
 432}
 433
 434/**
 435 * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
 436 * @map:        Previous match result, used as initial bitmap
 437 * @fill:       Destination bitmap to be filled with current match result
 438 * @f:          Field, containing lookup and mapping tables
 439 * @offset:     Ignore buckets before the given index, no bits are filled there
 440 * @pkt:        Packet data, pointer to input nftables register
 441 * @first:      If this is the first field, don't source previous result
 442 * @last:       Last field: stop at the first match and return bit index
 443 *
 444 * See nft_pipapo_avx2_lookup_4b_2().
 445 *
 446 * This is used for 48-bit fields (e.g. MAC addresses/EUI-48).
 447 *
 448 * Return: -1 on no match, rule index of match if @last, otherwise first long
 449 * word index to be checked next (i.e. first filled word).
 450 */
 451static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
 452                                        struct nft_pipapo_field *f, int offset,
 453                                        const u8 *pkt, bool first, bool last)
 454{
            /* One nibble per group, high nibble of each byte first */
 455        u8 pg[12] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
 456                       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
 457                       pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
 458                    };
 459        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 460        unsigned long *lt = f->lt, bsize = f->bsize;
 461
            /* Start at the @offset-th 256-bit chunk of every bucket */
 462        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 463        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
                    /* Position of the current 256-bit chunk in @map, in longs */
 464                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 465
                    /* ymm0 = previous result chunk; it is tested only after
                     * the first three bucket loads have been issued.
                     */
 466                if (!first)
 467                        NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
 468
 469                NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
 470                NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
 471                NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
 472
 473                if (!first) {
                            /* Previous result empty: skip the chunk. Otherwise
                             * fold it into bucket 0 (ymm1).
                             */
 474                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
 475                        NFT_PIPAPO_AVX2_AND(1, 1, 0);
 476                }
 477
                    /* Loads and ANDs are interleaved from here on. Register
                     * numbers get reused: a register is reloaded or overwritten
                     * only after the AND consuming its old value was issued.
                     */
 478                NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
 479                NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt,  4,  pg[4], bsize);
 480                NFT_PIPAPO_AVX2_AND(6,   2,  3);
 481                NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
 482                NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt,  6,  pg[6], bsize);
 483                NFT_PIPAPO_AVX2_AND(9,   1,  4);
 484                NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt,  7,  pg[7], bsize);
 485                NFT_PIPAPO_AVX2_AND(11,  5,  6);
 486                NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt,  8,  pg[8], bsize);
 487                NFT_PIPAPO_AVX2_AND(13,  7,  8);
 488                NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt,  9,  pg[9], bsize);
 489
 490                NFT_PIPAPO_AVX2_AND(0,   9, 10);
 491                NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 10,  pg[10], bsize);
 492                NFT_PIPAPO_AVX2_AND(2,  11, 12);
 493                NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11,  pg[11], bsize);
 494                NFT_PIPAPO_AVX2_AND(4,  13, 14);
 495                NFT_PIPAPO_AVX2_AND(5,   0,  1);
 496
 497                NFT_PIPAPO_AVX2_AND(6,   2,  3);
 498
 499                /* Stalls */
 500                NFT_PIPAPO_AVX2_AND(7,   4,  5);
 501                NFT_PIPAPO_AVX2_AND(8,   6,  7);
 502
                    /* ymm8 = AND of all twelve buckets (and of the previous
                     * result when !first); if empty, go clear this chunk.
                     */
 503                NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
 504                NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
 505
                    /* Expand the bits just stored into @fill; with @last the
                     * index of the first set bit comes straight back instead.
                     */
 506                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 507                if (last)
 508                        return b;
 509
                    /* Only the first chunk that produced bits sets the result.
                     * NOTE(review): the division presumably converts a bit
                     * position in @fill to a 256-bit chunk index, which needs
                     * XSAVE_YMM_SIZE to be 256 -- confirm.
                     */
 510                if (unlikely(ret == -1))
 511                        ret = b / XSAVE_YMM_SIZE;
 512
 513                continue;
 514nomatch:
                    /* ymm15 is zeroed by nft_pipapo_avx2_prepare() */
 515                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 516nothing:
 517                ;
 518        }
 519
 520        return ret;
 521}
 522
 523/**
 524 * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
 525 * @map:        Previous match result, used as initial bitmap
 526 * @fill:       Destination bitmap to be filled with current match result
 527 * @f:          Field, containing lookup and mapping tables
 528 * @offset:     Ignore buckets before the given index, no bits are filled there
 529 * @pkt:        Packet data, pointer to input nftables register
 530 * @first:      If this is the first field, don't source previous result
 531 * @last:       Last field: stop at the first match and return bit index
 532 *
 533 * See nft_pipapo_avx2_lookup_4b_2().
 534 *
 535 * This is used for 128-bit fields (e.g. IPv6 addresses).
 536 *
 537 * Return: -1 on no match, rule index of match if @last, otherwise first long
 538 * word index to be checked next (i.e. first filled word).
 539 */
 540static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
 541                                        struct nft_pipapo_field *f, int offset,
 542                                        const u8 *pkt, bool first, bool last)
 543{
            /* One nibble per group, high nibble of each byte first */
 544        u8 pg[32] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
 545                       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
 546                       pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
 547                       pkt[6] >> 4,  pkt[6] & 0xf,  pkt[7] >> 4,  pkt[7] & 0xf,
 548                       pkt[8] >> 4,  pkt[8] & 0xf,  pkt[9] >> 4,  pkt[9] & 0xf,
 549                      pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
 550                      pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
 551                      pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
 552                    };
 553        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 554        unsigned long *lt = f->lt, bsize = f->bsize;
 555
            /* Start at the @offset-th 256-bit chunk of every bucket */
 556        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 557        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
                    /* Position of the current 256-bit chunk in @map, in longs */
 558                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 559
                    /* ymm0 = previous result chunk; it is tested only after
                     * the first four bucket loads have been issued.
                     */
 560                if (!first)
 561                        NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
 562
 563                NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
 564                NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
 565                NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
 566                NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
 567                if (!first) {
                            /* Previous result empty: skip the chunk. Otherwise
                             * fold it into bucket 0 (ymm1).
                             */
 568                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
 569                        NFT_PIPAPO_AVX2_AND(1, 1, 0);
 570                }
 571
                    /* Loads and ANDs are interleaved from here on. With 32
                     * buckets and only ymm0-ymm14 available, register numbers
                     * are recycled several times: a register is reloaded or
                     * overwritten only after the AND consuming its old value
                     * was issued. Reordering any of these lines risks
                     * clobbering a live partial result.
                     */
 572                NFT_PIPAPO_AVX2_AND(5,   2,  3);
 573                NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt,  4,  pg[4], bsize);
 574                NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
 575                NFT_PIPAPO_AVX2_AND(8,   1,  4);
 576                NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt,  6,  pg[6], bsize);
 577                NFT_PIPAPO_AVX2_AND(10,  5,  6);
 578                NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt,  7,  pg[7], bsize);
 579                NFT_PIPAPO_AVX2_AND(12,  7,  8);
 580                NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt,  8,  pg[8], bsize);
 581                NFT_PIPAPO_AVX2_AND(14,  9, 10);
 582
 583                NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt,  9,  pg[9], bsize);
 584                NFT_PIPAPO_AVX2_AND(1,  11, 12);
 585                NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 10, pg[10], bsize);
 586                NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11, pg[11], bsize);
 587                NFT_PIPAPO_AVX2_AND(4,  13, 14);
 588                NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 12, pg[12], bsize);
 589                NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 13, pg[13], bsize);
 590                NFT_PIPAPO_AVX2_AND(7,   0,  1);
 591                NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 14, pg[14], bsize);
 592                NFT_PIPAPO_AVX2_AND(9,   2,  3);
 593                NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
 594                NFT_PIPAPO_AVX2_AND(11,  4,  5);
 595                NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
 596                NFT_PIPAPO_AVX2_AND(13,  6,  7);
 597                NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
 598
 599                NFT_PIPAPO_AVX2_AND(0,   8,  9);
 600                NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 18, pg[18], bsize);
 601                NFT_PIPAPO_AVX2_AND(2,  10, 11);
 602                NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 19, pg[19], bsize);
 603                NFT_PIPAPO_AVX2_AND(4,  12, 13);
 604                NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 20, pg[20], bsize);
 605                NFT_PIPAPO_AVX2_AND(6,  14,  0);
 606                NFT_PIPAPO_AVX2_AND(7,   1,  2);
 607                NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 21, pg[21], bsize);
 608                NFT_PIPAPO_AVX2_AND(9,   3,  4);
 609                NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
 610                NFT_PIPAPO_AVX2_AND(11,  5,  6);
 611                NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
 612                NFT_PIPAPO_AVX2_AND(13,  7,  8);
 613
 614                NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
 615                NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 25, pg[25], bsize);
 616                NFT_PIPAPO_AVX2_AND(1,   9, 10);
 617                NFT_PIPAPO_AVX2_AND(2,  11, 12);
 618                NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 26, pg[26], bsize);
 619                NFT_PIPAPO_AVX2_AND(4,  13, 14);
 620                NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 27, pg[27], bsize);
 621                NFT_PIPAPO_AVX2_AND(6,   0,  1);
 622                NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 28, pg[28], bsize);
 623                NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 29, pg[29], bsize);
 624                NFT_PIPAPO_AVX2_AND(9,   2,  3);
 625                NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
 626                NFT_PIPAPO_AVX2_AND(11,  4,  5);
 627                NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
 628
                    /* All buckets are loaded; only the final reduction is left */
 629                NFT_PIPAPO_AVX2_AND(0,   6,  7);
 630                NFT_PIPAPO_AVX2_AND(1,   8,  9);
 631                NFT_PIPAPO_AVX2_AND(2,  10, 11);
 632                NFT_PIPAPO_AVX2_AND(3,  12,  0);
 633
 634                /* Stalls */
 635                NFT_PIPAPO_AVX2_AND(4,   1,  2);
 636                NFT_PIPAPO_AVX2_AND(5,   3,  4);
 637
                    /* ymm5 = AND of all 32 buckets (and of the previous result
                     * when !first); if empty, go clear this chunk of @map.
                     */
 638                NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
 639                NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
 640
                    /* Expand the bits just stored into @fill; with @last the
                     * index of the first set bit comes straight back instead.
                     */
 641                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 642                if (last)
 643                        return b;
 644
                    /* Only the first chunk that produced bits sets the result.
                     * NOTE(review): the division presumably converts a bit
                     * position in @fill to a 256-bit chunk index, which needs
                     * XSAVE_YMM_SIZE to be 256 -- confirm.
                     */
 645                if (unlikely(ret == -1))
 646                        ret = b / XSAVE_YMM_SIZE;
 647
 648                continue;
 649nomatch:
                    /* ymm15 is zeroed by nft_pipapo_avx2_prepare() */
 650                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 651nothing:
 652                ;
 653        }
 654
 655        return ret;
 656}
 657
 658/**
 659 * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
 660 * @map:        Previous match result, used as initial bitmap
 661 * @fill:       Destination bitmap to be filled with current match result
 662 * @f:          Field, containing lookup and mapping tables
 663 * @offset:     Ignore buckets before the given index, no bits are filled there
 664 * @pkt:        Packet data, pointer to input nftables register
 665 * @first:      If this is the first field, don't source previous result
 666 * @last:       Last field: stop at the first match and return bit index
 667 *
 668 * See nft_pipapo_avx2_lookup_4b_2().
 669 *
 670 * This is used for 8-bit fields (i.e. protocol numbers).
 671 *
 672 * Return: -1 on no match, rule index of match if @last, otherwise first long
 673 * word index to be checked next (i.e. first filled word).
 674 */
 675static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
 676                                       struct nft_pipapo_field *f, int offset,
 677                                       const u8 *pkt, bool first, bool last)
 678{
 679        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 680        unsigned long *lt = f->lt, bsize = f->bsize;
 681
 682        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 683        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
 684                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 685
 686                if (first) {
 687                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
 688                } else {
 689                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
 690                        NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
 691                        NFT_PIPAPO_AVX2_AND(2, 0, 1);
 692                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
 693                }
 694
 695                NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
 696                NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
 697
 698                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 699                if (last)
 700                        return b;
 701
 702                if (unlikely(ret == -1))
 703                        ret = b / XSAVE_YMM_SIZE;
 704
 705                continue;
 706nomatch:
 707                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 708nothing:
 709                ;
 710        }
 711
 712        return ret;
 713}
 714
 715/**
 716 * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
 717 * @map:        Previous match result, used as initial bitmap
 718 * @fill:       Destination bitmap to be filled with current match result
 719 * @f:          Field, containing lookup and mapping tables
 720 * @offset:     Ignore buckets before the given index, no bits are filled there
 721 * @pkt:        Packet data, pointer to input nftables register
 722 * @first:      If this is the first field, don't source previous result
 723 * @last:       Last field: stop at the first match and return bit index
 724 *
 725 * See nft_pipapo_avx2_lookup_4b_2().
 726 *
 727 * This is used for 16-bit fields (i.e. ports).
 728 *
 729 * Return: -1 on no match, rule index of match if @last, otherwise first long
 730 * word index to be checked next (i.e. first filled word).
 731 */
 732static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
 733                                       struct nft_pipapo_field *f, int offset,
 734                                       const u8 *pkt, bool first, bool last)
 735{
 736        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 737        unsigned long *lt = f->lt, bsize = f->bsize;
 738
 739        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 740        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
 741                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 742
 743                if (first) {
 744                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
 745                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
 746                        NFT_PIPAPO_AVX2_AND(4, 0, 1);
 747                } else {
 748                        NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
 749                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
 750                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
 751
 752                        /* Stall */
 753                        NFT_PIPAPO_AVX2_AND(3, 0, 1);
 754                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
 755                        NFT_PIPAPO_AVX2_AND(4, 3, 2);
 756                }
 757
 758                /* Stall */
 759                NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
 760                NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
 761
 762                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 763                if (last)
 764                        return b;
 765
 766                if (unlikely(ret == -1))
 767                        ret = b / XSAVE_YMM_SIZE;
 768
 769                continue;
 770nomatch:
 771                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 772nothing:
 773                ;
 774        }
 775
 776        return ret;
 777}
 778
 779/**
 780 * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
 781 * @map:        Previous match result, used as initial bitmap
 782 * @fill:       Destination bitmap to be filled with current match result
 783 * @f:          Field, containing lookup and mapping tables
 784 * @offset:     Ignore buckets before the given index, no bits are filled there
 785 * @pkt:        Packet data, pointer to input nftables register
 786 * @first:      If this is the first field, don't source previous result
 787 * @last:       Last field: stop at the first match and return bit index
 788 *
 789 * See nft_pipapo_avx2_lookup_4b_2().
 790 *
 791 * This is used for 32-bit fields (i.e. IPv4 addresses).
 792 *
 793 * Return: -1 on no match, rule index of match if @last, otherwise first long
 794 * word index to be checked next (i.e. first filled word).
 795 */
 796static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
 797                                       struct nft_pipapo_field *f, int offset,
 798                                       const u8 *pkt, bool first, bool last)
 799{
 800        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 801        unsigned long *lt = f->lt, bsize = f->bsize;
 802
 803        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 804        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
 805                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 806
 807                if (first) {
 808                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
 809                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
 810                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
 811                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);
 812
 813                        /* Stall */
 814                        NFT_PIPAPO_AVX2_AND(4, 0, 1);
 815                        NFT_PIPAPO_AVX2_AND(5, 2, 3);
 816                        NFT_PIPAPO_AVX2_AND(0, 4, 5);
 817                } else {
 818                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
 819                        NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
 820                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
 821                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
 822                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);
 823
 824                        NFT_PIPAPO_AVX2_AND(5, 0, 1);
 825                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
 826                        NFT_PIPAPO_AVX2_AND(6, 2, 3);
 827
 828                        /* Stall */
 829                        NFT_PIPAPO_AVX2_AND(7, 4, 5);
 830                        NFT_PIPAPO_AVX2_AND(0, 6, 7);
 831                }
 832
 833                NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
 834                NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
 835
 836                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 837                if (last)
 838                        return b;
 839
 840                if (unlikely(ret == -1))
 841                        ret = b / XSAVE_YMM_SIZE;
 842
 843                continue;
 844
 845nomatch:
 846                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 847nothing:
 848                ;
 849        }
 850
 851        return ret;
 852}
 853
 854/**
 855 * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
 856 * @map:        Previous match result, used as initial bitmap
 857 * @fill:       Destination bitmap to be filled with current match result
 858 * @f:          Field, containing lookup and mapping tables
 859 * @offset:     Ignore buckets before the given index, no bits are filled there
 860 * @pkt:        Packet data, pointer to input nftables register
 861 * @first:      If this is the first field, don't source previous result
 862 * @last:       Last field: stop at the first match and return bit index
 863 *
 864 * See nft_pipapo_avx2_lookup_4b_2().
 865 *
 866 * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
 867 *
 868 * Return: -1 on no match, rule index of match if @last, otherwise first long
 869 * word index to be checked next (i.e. first filled word).
 870 */
 871static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
 872                                       struct nft_pipapo_field *f, int offset,
 873                                       const u8 *pkt, bool first, bool last)
 874{
 875        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 876        unsigned long *lt = f->lt, bsize = f->bsize;
 877
 878        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 879        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
 880                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 881
 882                if (first) {
 883                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
 884                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
 885                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
 886                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);
 887                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 4, pkt[4], bsize);
 888
 889                        NFT_PIPAPO_AVX2_AND(5, 0, 1);
 890                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(6,  lt, 6, pkt[5], bsize);
 891                        NFT_PIPAPO_AVX2_AND(7, 2, 3);
 892
 893                        /* Stall */
 894                        NFT_PIPAPO_AVX2_AND(0, 4, 5);
 895                        NFT_PIPAPO_AVX2_AND(1, 6, 7);
 896                        NFT_PIPAPO_AVX2_AND(4, 0, 1);
 897                } else {
 898                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
 899                        NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
 900                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
 901                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
 902                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);
 903
 904                        NFT_PIPAPO_AVX2_AND(5, 0, 1);
 905                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
 906
 907                        NFT_PIPAPO_AVX2_AND(6, 2, 3);
 908                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(7,  lt, 4, pkt[4], bsize);
 909                        NFT_PIPAPO_AVX2_AND(0, 4, 5);
 910                        NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 5, pkt[5], bsize);
 911                        NFT_PIPAPO_AVX2_AND(2, 6, 7);
 912
 913                        /* Stall */
 914                        NFT_PIPAPO_AVX2_AND(3, 0, 1);
 915                        NFT_PIPAPO_AVX2_AND(4, 2, 3);
 916                }
 917
 918                NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
 919                NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
 920
 921                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
 922                if (last)
 923                        return b;
 924
 925                if (unlikely(ret == -1))
 926                        ret = b / XSAVE_YMM_SIZE;
 927
 928                continue;
 929
 930nomatch:
 931                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
 932nothing:
 933                ;
 934        }
 935
 936        return ret;
 937}
 938
 939/**
 940 * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
 941 * @map:        Previous match result, used as initial bitmap
 942 * @fill:       Destination bitmap to be filled with current match result
 943 * @f:          Field, containing lookup and mapping tables
 944 * @offset:     Ignore buckets before the given index, no bits are filled there
 945 * @pkt:        Packet data, pointer to input nftables register
 946 * @first:      If this is the first field, don't source previous result
 947 * @last:       Last field: stop at the first match and return bit index
 948 *
 949 * See nft_pipapo_avx2_lookup_4b_2().
 950 *
 951 * This is used for 128-bit fields (i.e. IPv6 addresses).
 952 *
 953 * Return: -1 on no match, rule index of match if @last, otherwise first long
 954 * word index to be checked next (i.e. first filled word).
 955 */
 956static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
 957                                        struct nft_pipapo_field *f, int offset,
 958                                        const u8 *pkt, bool first, bool last)
 959{
 960        int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
 961        unsigned long *lt = f->lt, bsize = f->bsize;
 962
 963        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
 964        for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
 965                int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
 966
 967                if (!first)
 968                        NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
 969
 970                NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  0,  pkt[0], bsize);
 971                NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  1,  pkt[1], bsize);
 972                NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt,  2,  pkt[2], bsize);
 973                if (!first) {
 974                        NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
 975                        NFT_PIPAPO_AVX2_AND(1, 1, 0);
 976                }
 977                NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt,  3,  pkt[3], bsize);
 978
 979                NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  4,  pkt[4], bsize);
 980                NFT_PIPAPO_AVX2_AND(6, 1, 2);
 981                NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  5,  pkt[5], bsize);
 982                NFT_PIPAPO_AVX2_AND(0, 3, 4);
 983                NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  6,  pkt[6], bsize);
 984
 985                NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  7,  pkt[7], bsize);
 986                NFT_PIPAPO_AVX2_AND(3, 5, 6);
 987                NFT_PIPAPO_AVX2_AND(4, 0, 1);
 988                NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  8,  pkt[8], bsize);
 989
 990                NFT_PIPAPO_AVX2_AND(6, 2, 3);
 991                NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  9,  pkt[9], bsize);
 992                NFT_PIPAPO_AVX2_AND(0, 4, 5);
 993                NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
 994                NFT_PIPAPO_AVX2_AND(2, 6, 7);
 995                NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
 996                NFT_PIPAPO_AVX2_AND(4, 0, 1);
 997                NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
 998                NFT_PIPAPO_AVX2_AND(6, 2, 3);
 999                NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
1000                NFT_PIPAPO_AVX2_AND(0, 4, 5);
1001                NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
1002                NFT_PIPAPO_AVX2_AND(2, 6, 7);
1003                NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
1004                NFT_PIPAPO_AVX2_AND(4, 0, 1);
1005
1006                /* Stall */
1007                NFT_PIPAPO_AVX2_AND(5, 2, 3);
1008                NFT_PIPAPO_AVX2_AND(6, 4, 5);
1009
1010                NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
1011                NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
1012
1013                b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
1014                if (last)
1015                        return b;
1016
1017                if (unlikely(ret == -1))
1018                        ret = b / XSAVE_YMM_SIZE;
1019
1020                continue;
1021
1022nomatch:
1023                NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
1024nothing:
1025                ;
1026        }
1027
1028        return ret;
1029}
1030
1031/**
1032 * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
1033 * @map:        Previous match result, used as initial bitmap
1034 * @fill:       Destination bitmap to be filled with current match result
1035 * @f:          Field, containing lookup and mapping tables
1036 * @offset:     Ignore buckets before the given index, no bits are filled there
1037 * @pkt:        Packet data, pointer to input nftables register
1038 * @first:      If this is the first field, don't source previous result
1039 * @last:       Last field: stop at the first match and return bit index
1040 *
1041 * This function should never be called, but is provided for the case the field
1042 * size doesn't match any of the known data types. Matching rate is
1043 * substantially lower than AVX2 routines.
1044 *
1045 * Return: -1 on no match, rule index of match if @last, otherwise first long
1046 * word index to be checked next (i.e. first filled word).
1047 */
1048static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
1049                                        struct nft_pipapo_field *f, int offset,
1050                                        const u8 *pkt, bool first, bool last)
1051{
1052        unsigned long *lt = f->lt, bsize = f->bsize;
1053        int i, ret = -1, b;
1054
1055        lt += offset * NFT_PIPAPO_LONGS_PER_M256;
1056
1057        if (first)
1058                memset(map, 0xff, bsize * sizeof(*map));
1059
1060        for (i = offset; i < bsize; i++) {
1061                if (f->bb == 8)
1062                        pipapo_and_field_buckets_8bit(f, map, pkt);
1063                else
1064                        pipapo_and_field_buckets_4bit(f, map, pkt);
1065                NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1066
1067                b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
1068
1069                if (last)
1070                        return b;
1071
1072                if (ret == -1)
1073                        ret = b / XSAVE_YMM_SIZE;
1074        }
1075
1076        return ret;
1077}
1078
1079/**
1080 * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
1081 * @desc:       Set description, element count and field description used
1082 * @features:   Flags: NFT_SET_INTERVAL needs to be there
1083 * @est:        Storage for estimation data
1084 *
1085 * Return: true if set is compatible and AVX2 available, false otherwise.
1086 */
1087bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
1088                              struct nft_set_estimate *est)
1089{
1090        if (!(features & NFT_SET_INTERVAL) ||
1091            desc->field_count < NFT_PIPAPO_MIN_FIELDS)
1092                return false;
1093
1094        if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
1095                return false;
1096
1097        est->size = pipapo_estimate_size(desc);
1098        if (!est->size)
1099                return false;
1100
1101        est->lookup = NFT_SET_CLASS_O_LOG_N;
1102
1103        est->space = NFT_SET_CLASS_O_N;
1104
1105        return true;
1106}
1107
1108/**
1109 * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
1110 * @net:        Network namespace
1111 * @set:        nftables API set representation
1112 * @elem:       nftables API element representation containing key data
1113 * @ext:        nftables API extension pointer, filled with matching reference
1114 *
1115 * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
1116 *
1117 * This implementation exploits the repetitive characteristic of the algorithm
1118 * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
1119 *
1120 * Return: true on match, false otherwise.
1121 */
1122bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
1123                            const u32 *key, const struct nft_set_ext **ext)
1124{
1125        struct nft_pipapo *priv = nft_set_priv(set);
1126        unsigned long *res, *fill, *scratch;
1127        u8 genmask = nft_genmask_cur(net);
1128        const u8 *rp = (const u8 *)key;
1129        struct nft_pipapo_match *m;
1130        struct nft_pipapo_field *f;
1131        bool map_index;
1132        int i, ret = 0;
1133
1134        m = rcu_dereference(priv->match);
1135
1136        /* This also protects access to all data related to scratch maps */
1137        kernel_fpu_begin();
1138
1139        scratch = *raw_cpu_ptr(m->scratch_aligned);
1140        if (unlikely(!scratch)) {
1141                kernel_fpu_end();
1142                return false;
1143        }
1144        map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
1145
1146        res  = scratch + (map_index ? m->bsize_max : 0);
1147        fill = scratch + (map_index ? 0 : m->bsize_max);
1148
1149        /* Starting map doesn't need to be set for this implementation */
1150
1151        nft_pipapo_avx2_prepare();
1152
1153next_match:
1154        nft_pipapo_for_each_field(f, i, m) {
1155                bool last = i == m->field_count - 1, first = !i;
1156
1157#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)                                \
1158                (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,  \
1159                                                         ret, rp,       \
1160                                                         first, last))
1161
1162                if (likely(f->bb == 8)) {
1163                        if (f->groups == 1) {
1164                                NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
1165                        } else if (f->groups == 2) {
1166                                NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
1167                        } else if (f->groups == 4) {
1168                                NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
1169                        } else if (f->groups == 6) {
1170                                NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
1171                        } else if (f->groups == 16) {
1172                                NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
1173                        } else {
1174                                ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1175                                                                  ret, rp,
1176                                                                  first, last);
1177                        }
1178                } else {
1179                        if (f->groups == 2) {
1180                                NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
1181                        } else if (f->groups == 4) {
1182                                NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
1183                        } else if (f->groups == 8) {
1184                                NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
1185                        } else if (f->groups == 12) {
1186                                NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
1187                        } else if (f->groups == 32) {
1188                                NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
1189                        } else {
1190                                ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1191                                                                  ret, rp,
1192                                                                  first, last);
1193                        }
1194                }
1195                NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1196
1197#undef NFT_SET_PIPAPO_AVX2_LOOKUP
1198
1199                if (ret < 0)
1200                        goto out;
1201
1202                if (last) {
1203                        *ext = &f->mt[ret].e->ext;
1204                        if (unlikely(nft_set_elem_expired(*ext) ||
1205                                     !nft_set_elem_active(*ext, genmask))) {
1206                                ret = 0;
1207                                goto next_match;
1208                        }
1209
1210                        goto out;
1211                }
1212
1213                swap(res, fill);
1214                rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
1215        }
1216
1217out:
1218        if (i % 2)
1219                raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
1220        kernel_fpu_end();
1221
1222        return ret >= 0;
1223}
1224