linux/arch/x86/crypto/sha1_avx2_x86_64_asm.S
/*
 *      Implement fast SHA-1 with AVX2 instructions. (x86_64)
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * Ilya Albrekht <ilya.albrekht@intel.com>
 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 * Ronen Zohar <ronen.zohar@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
 *
 * This implementation is based on the previous SSSE3 release; see
 * http://software.intel.com/en-us/articles/
 * improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Updates the 20-byte SHA-1 record at the start of 'state', from 'input',
 * for an even number ('blocks') of consecutive 64-byte blocks.
 *
 * extern "C" void sha1_transform_avx2(
 *      struct sha1_state *state, const u8* input, int blocks );
 */
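
/*
 * The arguments arrive per the x86_64 SysV ABI in %rdi, %rsi and %rdx
 * (picked up below as CTX, BUF and CNT).  A minimal, hypothetical usage
 * sketch of the prototype above:
 *
 *      struct sha1_state st = ...;
 *      sha1_transform_avx2(&st, data, nblocks);
 */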

#include <linux/linkage.h>

#define CTX     %rdi    /* arg1 */
#define BUF     %rsi    /* arg2 */
#define CNT     %rdx    /* arg3 */

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %eax
#define REG_E   %edx
#define REG_TB  %ebx
#define REG_TA  %r12d
#define REG_RA  %rcx
#define REG_RB  %rsi
#define REG_RC  %rdi
#define REG_RD  %rax
#define REG_RE  %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
#define REG_T1  %r11d
#define xmm_mov vmovups
#define avx2_zeroupper  vzeroupper
#define RND_F1  1
#define RND_F2  2
#define RND_F3  3

.macro REGALLOC
        .set A, REG_A
        .set B, REG_B
        .set C, REG_C
        .set D, REG_D
        .set E, REG_E
        .set TB, REG_TB
        .set TA, REG_TA

        .set RA, REG_RA
        .set RB, REG_RB
        .set RC, REG_RC
        .set RD, REG_RD
        .set RE, REG_RE

        .set RTA, REG_RTA
        .set RTB, REG_RTB

        .set T1, REG_T1
.endm

#define HASH_PTR        %r9
#define BLOCKS_CTR      %r8
#define BUFFER_PTR      %r10
#define BUFFER_PTR2     %r13

#define PRECALC_BUF     %r14
#define WK_BUF          %r15

#define W_TMP           %xmm0
#define WY_TMP          %ymm0
#define WY_TMP2         %ymm9

# AVX2 variables
#define WY0             %ymm3
#define WY4             %ymm5
#define WY08            %ymm7
#define WY12            %ymm8
#define WY16            %ymm12
#define WY20            %ymm13
#define WY24            %ymm14
#define WY28            %ymm15

#define YMM_SHUFB_BSWAP %ymm10

/*
 * Keep 2 iterations precalculated at a time:
 *    - 80 DWORDs per iteration * 2
 */
#define W_SIZE          (80*2*2 +16)

#define WK(t)   ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)   ((t)*2*2)(PRECALC_BUF)
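
/*
 * Worked example of the index math above: the W+K values for a pair of
 * blocks are interleaved in 32-byte groups of four rounds - the low 16
 * bytes of a group belong to the first block, the high 16 bytes to the
 * second.  For round t = 5 of the first block:
 *   WK(5)  = (5/4)*32 + (5%4)*4 + (5/80)*16  = 32 + 4 + 0  = 36
 * and for the same round of the second block, t = 85:
 *   WK(85) = (5/4)*32 + (5%4)*4 + (85/80)*16 = 32 + 4 + 16 = 52
 * W_SIZE is a dword count: 80 rounds * 2 blocks for each of the two
 * buffers that the main loop ping-pongs between, plus 16 dwords of slack.
 */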


.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm
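
/*
 * UPDATE_HASH folds a working variable back into the saved state:
 * \val += \hash, then \hash = \val.  The register therefore keeps the
 * updated chaining value, which the main body reuses when it rolls
 * straight into the next block.
 */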

.macro PRECALC_RESET_WY
        .set WY_00, WY0
        .set WY_04, WY4
        .set WY_08, WY08
        .set WY_12, WY12
        .set WY_16, WY16
        .set WY_20, WY20
        .set WY_24, WY24
        .set WY_28, WY28
        .set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
        /* Rotate macros */
        .set WY_32, WY_28
        .set WY_28, WY_24
        .set WY_24, WY_20
        .set WY_20, WY_16
        .set WY_16, WY_12
        .set WY_12, WY_08
        .set WY_08, WY_04
        .set WY_04, WY_00
        .set WY_00, WY_32

        /* Define register aliases */
        .set WY, WY_00
        .set WY_minus_04, WY_04
        .set WY_minus_08, WY_08
        .set WY_minus_12, WY_12
        .set WY_minus_16, WY_16
        .set WY_minus_20, WY_20
        .set WY_minus_24, WY_24
        .set WY_minus_28, WY_28
        .set WY_minus_32, WY
.endm

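/*
 * Summary of the eight-step group below: steps 0 and 1 load 16 message
 * bytes of the first block into the low lane and the matching 16 bytes
 * of the second block into the high lane, step 2 byte-swaps the
 * big-endian words, step 4 adds the round constant, and step 7 stores
 * the 32-byte W+K result into the precalc buffer.  The other steps are
 * left empty so the vector work is spread across the ALU rounds.
 */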
.macro PRECALC_00_15
        .if (i == 0) # Initialize and rotate registers
                PRECALC_RESET_WY
                PRECALC_ROTATE_WY
        .endif

        /* message scheduling pre-compute for rounds 0-15 */
        .if   ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                vmovdqu (i * 2)(BUFFER_PTR), W_TMP
        .elseif ((i & 7) == 1)
                vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
                         WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
        .elseif ((i & 7) == 4)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
        .elseif ((i & 7) == 7)
                vmovdqu  WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_16_31
        /*
         * message scheduling pre-compute for rounds 16-31
         * calculating last 32 w[i] values in 8 XMM registers
         * pre-calculate K+w[i] values and store to mem
         * for later load by ALU add instruction
         *
         * "brute force" vectorization for rounds 16-31 only
         * due to w[i]->w[i-3] dependency
         */
        .if   ((i & 7) == 0)
                /*
                 * blended AVX2 and ALU instruction scheduling
                 * 1 vector iteration per 8 rounds
                 */
                /* w[i-14] */
                vpalignr        $8, WY_minus_16, WY_minus_12, WY
                vpsrldq $4, WY_minus_04, WY_TMP               /* w[i-3] */
        .elseif ((i & 7) == 1)
                vpxor   WY_minus_08, WY, WY
                vpxor   WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 2)
                vpxor   WY_TMP, WY, WY
                vpslldq $12, WY, WY_TMP2
        .elseif ((i & 7) == 3)
                vpslld  $1, WY, WY_TMP
                vpsrld  $31, WY, WY
        .elseif ((i & 7) == 4)
                vpor    WY, WY_TMP, WY_TMP
                vpslld  $2, WY_TMP2, WY
        .elseif ((i & 7) == 5)
                vpsrld  $30, WY_TMP2, WY_TMP2
                vpxor   WY, WY_TMP, WY_TMP
        .elseif ((i & 7) == 7)
                vpxor   WY_TMP2, WY_TMP, WY
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

.macro PRECALC_32_79
        /*
         * in the SHA-1 specification:
         * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
         * instead we use the equivalent:
         * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         * which allows more efficient vectorization,
         * since the w[i]->w[i-3] dependency is broken
         */
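        /*
         * Why the two recurrences agree (for i >= 32): rol distributes
         * over xor, so each of w[i-3], w[i-8], w[i-14] and w[i-16] can
         * be expanded with the original rule, giving rol 2 of sixteen
         * terms.  w[i-11], w[i-17], w[i-19], w[i-22], w[i-24] and
         * w[i-30] each occur twice and cancel, leaving
         *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
         */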

        .if   ((i & 7) == 0)
        /*
         * blended AVX2 and ALU instruction scheduling
         * 1 vector iteration per 8 rounds
         */
                vpalignr        $8, WY_minus_08, WY_minus_04, WY_TMP
        .elseif ((i & 7) == 1)
                /* W is W_minus_32 before xor */
                vpxor   WY_minus_28, WY, WY
        .elseif ((i & 7) == 2)
                vpxor   WY_minus_16, WY_TMP, WY_TMP
        .elseif ((i & 7) == 3)
                vpxor   WY_TMP, WY, WY
        .elseif ((i & 7) == 4)
                vpslld  $2, WY, WY_TMP
        .elseif ((i & 7) == 5)
                vpsrld  $30, WY, WY
                vpor    WY, WY_TMP, WY
        .elseif ((i & 7) == 7)
                vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
                vmovdqu WY_TMP, PRECALC_WK(i&~7)

                PRECALC_ROTATE_WY
        .endif
.endm

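/*
 * PRECALC below runs with i = 0..159: two precalc steps per message
 * word, covering the schedule of the next two interleaved blocks.  The
 * K_XMM thresholds 40/80/120 therefore line up with the word indices
 * 20/40/60 at which the SHA-1 round constant changes, and K_XMM selects
 * the matching 32-byte row of K_XMM_AR.
 */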
.macro PRECALC r, s
        .set i, \r

        .if (i < 40)
                .set K_XMM, 32*0
        .elseif (i < 80)
                .set K_XMM, 32*1
        .elseif (i < 120)
                .set K_XMM, 32*2
        .else
                .set K_XMM, 32*3
        .endif

        .if (i<32)
                PRECALC_00_15   \s
        .elseif (i<64)
                PRECALC_16_31   \s
        .elseif (i < 160)
                PRECALC_32_79   \s
        .endif
.endm

.macro ROTATE_STATE
        .set T_REG, E
        .set E, D
        .set D, C
        .set C, B
        .set B, TB
        .set TB, A
        .set A, T_REG

        .set T_REG, RE
        .set RE, RD
        .set RD, RC
        .set RC, RB
        .set RB, RTB
        .set RTB, RA
        .set RA, T_REG
.endm

/* RND_FUN dispatches on the RND_Fx value saved in ROUND_FUNC by the RR macro */

.macro RND_FUN f, r
        .if (\f == RND_F1)
                ROUND_F1        \r
        .elseif (\f == RND_F2)
                ROUND_F2        \r
        .elseif (\f == RND_F3)
                ROUND_F3        \r
        .endif
.endm

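/*
 * RR emits two SHA-1 rounds per invocation.  Because each ROUND_Fx
 * macro computes F for the *next* round while it finishes the current
 * one, ROUND_FUNC is switched at round_id 18, 38 and 58 so that the new
 * F function is first precomputed for rounds 20, 40 and 60, where the
 * specification changes it.
 */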
.macro RR r
        .set round_id, (\r % 80)

        .if (round_id == 0)        /* Precalculate F for first round */
                .set ROUND_FUNC, RND_F1
                mov     B, TB

                rorx    $(32-30), B, B    /* b>>>2 */
                andn    D, TB, T1
                and     C, TB
                xor     T1, TB
        .endif

        RND_FUN ROUND_FUNC, \r
        ROTATE_STATE

        .if   (round_id == 18)
                .set ROUND_FUNC, RND_F2
        .elseif (round_id == 38)
                .set ROUND_FUNC, RND_F3
        .elseif (round_id == 58)
                .set ROUND_FUNC, RND_F2
        .endif

        .set round_id, ( (\r+1) % 80)

        RND_FUN ROUND_FUNC, (\r+1)
        ROTATE_STATE
.endm

.macro ROUND_F1 r
        add     WK(\r), E

        andn    C, A, T1                        /* ~b&d */
        lea     (RE,RTB), E             /* Add F from the previous round */

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        /*
         * Calculate F for the next round
         * (b & c) ^ andn[b, d]
         */
        and     B, A                    /* b&c */
        xor     T1, A                   /* F1 = (b&c) ^ (~b&d) */

        lea     (RE,RTA), E             /* E += A >>> 5 */
.endm

.macro ROUND_F2 r
        add     WK(\r), E
        lea     (RE,RTB), E             /* Add F from the previous round */

        /* Calculate F for the next round */
        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        .if ((round_id) < 79)
                rorx    $(32-30), A, TB /* b>>>2 for next round */
        .endif
        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        .if ((round_id) < 79)
                xor     B, A
        .endif

        add     TA, E                   /* E += A >>> 5 */

        .if ((round_id) < 79)
                xor     C, A
        .endif
.endm

.macro ROUND_F3 r
        add     WK(\r), E
        PRECALC (\r)                    /* msg scheduling for next 2 blocks */

        lea     (RE,RTB), E             /* Add F from the previous round */

        mov     B, T1
        or      A, T1

        rorx    $(32-5), A, TA          /* T2 = A >>> 5 */
        rorx    $(32-30), A, TB         /* b>>>2 for next round */

        /*
         * Calculate F for the next round
         * (b and c) or (d and (b or c))
         */
        and     C, T1
        and     B, A
        or      T1, A

        add     TA, E                   /* E += A >>> 5 */

.endm

/*
 * Add the constant \d to \a only if \b >= \c (uses RTA as a temp):
 *   \a = (\b >= \c) ? \a + \d : \a
 */
.macro ADD_IF_GE a, b, c, d
        mov     \a, RTA
        add     $\d, RTA
        cmp     $\c, \b
        cmovge  RTA, \a
.endm
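
/*
 * For example, "ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64" below advances
 * the second-block pointer by 64 bytes only when at least two blocks
 * remain, i.e. only when that second block actually exists.
 */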

/*
 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
 */
.macro SHA1_PIPELINED_MAIN_BODY

        REGALLOC

        mov     (HASH_PTR), A
        mov     4(HASH_PTR), B
        mov     8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

        mov     %rsp, PRECALC_BUF
        lea     (2*4*80+32)(%rsp), WK_BUF

        # Precalc WK for first 2 blocks
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
        .set i, 0
        .rept    160
                PRECALC i
                .set i, i + 1
        .endr

        /* Go to next block if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
        xchg    WK_BUF, PRECALC_BUF

        .align 32
_loop:
        /*
         * The code loops over all remaining blocks; BLOCKS_CTR holds
         * how many are left, and we exit once it reaches zero.
         */
        test BLOCKS_CTR, BLOCKS_CTR
        jnz _begin
        .align 32
        jmp     _end
        .align 32
_begin:

        /*
         * Do first block
         * rounds: 0,2,4,6,8
         */
        .set j, 0
        .rept 5
                RR      j
                .set j, j+2
        .endr

        jmp _loop0
_loop0:

        /*
         * rounds:
         * 10,12,14,16,18
         * 20,22,24,26,28
         * 30,32,34,36,38
         * 40,42,44,46,48
         * 50,52,54,56,58
         */
        .rept 25
                RR      j
                .set j, j+2
        .endr

        /* Update Counter */
        sub $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
        /*
         * rounds
         * 60,62,64,66,68
         * 70,72,74,76,78
         */
        .rept 10
                RR      j
                .set j, j+2
        .endr

        UPDATE_HASH     (HASH_PTR), A
        UPDATE_HASH     4(HASH_PTR), TB
        UPDATE_HASH     8(HASH_PTR), C
        UPDATE_HASH     12(HASH_PTR), D
        UPDATE_HASH     16(HASH_PTR), E

        test    BLOCKS_CTR, BLOCKS_CTR
        jz      _loop

        mov     TB, B

        /* Process second block */
        /*
         * rounds
         *  0+80, 2+80, 4+80, 6+80, 8+80
         * 10+80,12+80,14+80,16+80,18+80
         */

        .set j, 0
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        jmp     _loop1
_loop1:
        /*
         * rounds
         * 20+80,22+80,24+80,26+80,28+80
         * 30+80,32+80,34+80,36+80,38+80
         */
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        jmp     _loop2
_loop2:

        /*
         * rounds
         * 40+80,42+80,44+80,46+80,48+80
         * 50+80,52+80,54+80,56+80,58+80
         */
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        /* update counter */
        sub     $1, BLOCKS_CTR
        /* Move to the next block only if needed */
        ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

        jmp     _loop3
_loop3:

        /*
         * rounds
         * 60+80,62+80,64+80,66+80,68+80
         * 70+80,72+80,74+80,76+80,78+80
         */
        .rept 10
                RR      j+80
                .set j, j+2
        .endr

        UPDATE_HASH     (HASH_PTR), A
        UPDATE_HASH     4(HASH_PTR), TB
        UPDATE_HASH     8(HASH_PTR), C
        UPDATE_HASH     12(HASH_PTR), D
        UPDATE_HASH     16(HASH_PTR), E

        /* Reset state for AVX2 reg permutation */
        mov     A, TA
        mov     TB, A
        mov     C, TB
        mov     E, C
        mov     D, B
        mov     TA, D

        REGALLOC

        xchg    WK_BUF, PRECALC_BUF

        jmp     _loop

        .align 32
        _end:

.endm
/*
 * macro implements SHA-1 function's body for several 64-byte blocks
 * param: function's name
 */
.macro SHA1_VECTOR_ASM  name
        SYM_FUNC_START(\name)

        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        RESERVE_STACK  = (W_SIZE*4 + 8+24)

        /* Align stack */
        mov     %rsp, %rbx
        and     $~(0x20-1), %rsp
        push    %rbx
        sub     $RESERVE_STACK, %rsp
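
        /*
         * The caller's %rsp was saved in %rbx, %rsp was rounded down to
         * a 32-byte boundary, and the saved value was pushed so that
         * the "pop %rsp" in the epilogue restores it.  The
         * RESERVE_STACK bytes reserved here hold the two W+K scratch
         * buffers later addressed through PRECALC_BUF and WK_BUF.
         */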

        avx2_zeroupper

        /* Setup initial values */
        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        mov     BUF, BUFFER_PTR2
        mov     CNT, BLOCKS_CTR

        xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        avx2_zeroupper

        add     $RESERVE_STACK, %rsp
        pop     %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbx

        ret

        SYM_FUNC_END(\name)
.endm

.section .rodata

#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

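/*
 * Each SHA-1 round constant is replicated across all eight dwords of a
 * 32-byte row, so a single vpaddd adds the constant to four schedule
 * words of each of the two interleaved blocks; the K_XMM offset
 * (0/32/64/96) picks the row for the current round range.
 */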
.align 128
K_XMM_AR:
        .long K1, K1, K1, K1
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4
        .long K4, K4, K4, K4

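/*
 * vpshufb control mask that reverses the bytes within each 32-bit word,
 * converting the big-endian message words to host byte order; the
 * pattern is repeated so both 128-bit lanes are shuffled the same way.
 */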
BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f
.text

SHA1_VECTOR_ASM     sha1_transform_avx2