linux/arch/x86/crypto/sha256-avx-asm.S
########################################################################
# Implement fast SHA-256 with AVX1 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 1 block at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define    VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
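
# In AT&T operand order the two instructions above read "reg += mem" then
# "mem = reg", so the memory word is updated in place and the register is
# left holding the sum as well. A minimal C sketch of the effect (the
# helper name is illustrative, not part of this file):
#
#       #include <stdint.h>
#
#       static inline void addm(uint32_t *mem, uint32_t *reg)
#       {
#               *reg += *mem;   /* add     mem, reg */
#               *mem  = *reg;   /* mov     reg, mem */
#       }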


# MY_ROR amt, reg
# Rotate reg right by amt bits (shld of a register with itself)
.macro MY_ROR p1 p2
        shld    $(32-(\p1)), \p2, \p2
.endm
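
# With both operands the same register, shld $(32-n), r, r is a rotate
# left by (32-n) bits, i.e. a rotate right by n bits. A C sketch of the
# identity the macro relies on (0 < n < 32; ror32/rol32 are illustrative
# names here):
#
#       #include <stdint.h>
#
#       static inline uint32_t ror32(uint32_t x, unsigned int n)
#       {
#               return (x >> n) | (x << (32 - n));
#       }
#
#       static inline uint32_t rol32(uint32_t x, unsigned int n)
#       {
#               return (x << n) | (x >> (32 - n));
#       }
#
#       /* MY_ROR n, reg  <=>  reg = rol32(reg, 32 - n) == ror32(reg, n) */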

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
        VMOVDQ \p2, \p1
        vpshufb \p3, \p1, \p1
.endm
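
# SHA-256 reads the 64-byte block as sixteen big-endian 32-bit words, so
# the vpshufb with BYTE_FLIP_MASK byte-swaps four dwords per load. A
# scalar C sketch of the same load from an unaligned buffer (helper name
# illustrative; kernel C code would normally use get_unaligned_be32()):
#
#       #include <stdint.h>
#
#       static void load_be32x4(uint32_t w[4], const uint8_t *p)
#       {
#               for (int i = 0; i < 4; i++, p += 4)
#                       w[i] = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
#                              ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
#       }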

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9
XTMP5 = %xmm11

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d


_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
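
# Viewed as a C struct, the scratch area reserved below %rsp looks like
# this (a sketch only; the struct does not exist in the sources, field
# names mirror the assembler symbols above):
#
#       #include <stdint.h>
#
#       struct sha256_avx_stack {
#               uint64_t inp_end;       /* _INP_END: ptr to end of input  */
#               uint64_t inp;           /* _INP:     saved input pointer  */
#               uint32_t xfer[4];       /* _XFER:    current K[t] + W[t]  */
#       };                              /* STACK_SIZE == 32 bytes         */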

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
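
# ROTATE_ARGS shuffles the eight working variables purely at assembly
# time by re-binding the symbols a..h to different registers, so no data
# actually moves; the adds into d and h are done by the round code before
# the rotation. Because the 64 rounds are a multiple of 8, the symbols
# map back onto their original registers by the time the digest is
# written out. In the usual scalar C formulation the same step reads as
# explicit assignments (t1/t2 being the round temporaries):
#
#       h = g; g = f; f = e; e = d + t1;
#       d = c; c = b; b = a; a = t1 + t2;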

.macro FOUR_ROUNDS_AND_SCHED
        ## compute s0 four at a time and s1 two at a time
        ## compute W[-16] + W[-7] 4 at a time
        ## (a scalar C sketch of this message schedule follows the macro)

        mov     e, y0                   # y0 = e
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        vpalignr $4, X2, X3, XTMP0      # XTMP0 = W[-7]
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        xor     g, y2                   # y2 = f^g
        vpaddd  X0, XTMP0, XTMP0        # XTMP0 = W[-7] + W[-16]
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        ## compute s0
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y0, y2                  # y2 = S1 + CH
        add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        vpsrld  $7, XTMP1, XTMP2        # XTMP2 = W[-15] >> 7
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        vpslld  $(32-7), XTMP1, XTMP3   # XTMP3 = W[-15] << (32-7)
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
        mov     e, y0                   # y0 = e
        mov     a, y1                   # y1 = a
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        vpsrld  $18, XTMP1, XTMP2       # XTMP2 = W[-15] >> 18
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        vpsrld  $3, XTMP1, XTMP4        # XTMP4 = W[-15] >> 3
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpslld  $(32-18), XTMP1, XTMP1  # XTMP1 = W[-15] << (32-18)
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        vpxor   XTMP1, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18)
        add     y0, y2                  # y2 = S1 + CH
        add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        ## compute low s1
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
        mov     e, y0                   # y0 = e
        mov     a, y1                   # y1 = a
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld  $10, XTMP2, XTMP4       # XTMP4 = W[-2] >> 10 {BBAA}
        xor     g, y2                   # y2 = f^g
        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xBxA}
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xBxA}
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA}
        add     y0, y2                  # y2 = S1 + CH
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        ## compute high s1
        vpshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
        mov     e, y0                   # y0 = e
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        xor     g, y2                   # y2 = f^g
        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] MY_ROR 19 {xDxC}
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] MY_ROR 17 {xDxC}
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        vpxor   XTMP3, XTMP2, XTMP2     # XTMP2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC}
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y0, y2                  # y2 = S1 + CH
        add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
        rotate_Xs
.endm
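
# For reference, the message-schedule recurrence that the vector code in
# FOUR_ROUNDS_AND_SCHED interleaves with the rounds, written as scalar C
# (a sketch in FIPS 180-4 notation, not code taken from this file):
#
#       #include <stdint.h>
#
#       static inline uint32_t ror32(uint32_t x, unsigned int n)
#       {
#               return (x >> n) | (x << (32 - n));
#       }
#
#       #define s0(x) (ror32(x, 7)  ^ ror32(x, 18) ^ ((x) >> 3))
#       #define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ ((x) >> 10))
#
#       /* extend W[0..15] to W[16..63] */
#       static void sha256_schedule(uint32_t W[64])
#       {
#               int t;
#
#               for (t = 16; t < 64; t++)
#                       W[t] = s1(W[t - 2]) + W[t - 7] +
#                              s0(W[t - 15]) + W[t - 16];
#       }
#
# Each invocation of the macro produces four new W values: s0 is applied
# to four lanes at once, while s1 (which needs the two newest W values)
# is computed in two halves, {xBxA} then {xDxC}, and merged together with
# the SHUF_00BA and SHUF_DC00 shuffles.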

## input is [rsp + _XFER + \round * 4]
## (a scalar C sketch of this round follows the macro)
.macro DO_ROUND round
        mov     e, y0                   # y0 = e
        MY_ROR  (25-11), y0             # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        MY_ROR  (22-13), y1             # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        MY_ROR  (11-6), y0              # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        MY_ROR  (13-2), y1              # y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     e, y2                   # y2 = (f^g)&e
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        MY_ROR  6, y0                   # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        add     y0, y2                  # y2 = S1 + CH
        MY_ROR  2, y1                   # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        offset = \round * 4 + _XFER
        add     offset(%rsp), y2        # y2 = k + w + S1 + CH
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
.endm
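
# Both DO_ROUND and the four interleaved rounds of FOUR_ROUNDS_AND_SCHED
# implement the standard SHA-256 round. A scalar C sketch (FIPS 180-4
# notation, not code from this file); kw is the dword read from _XFER,
# i.e. K[t] + W[t] pre-added by the caller:
#
#       #include <stdint.h>
#
#       static inline uint32_t ror32(uint32_t x, unsigned int n)
#       {
#               return (x >> n) | (x << (32 - n));
#       }
#
#       #define S0(x)      (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
#       #define S1(x)      (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
#       #define Ch(x,y,z)  (((x) & (y)) ^ (~(x) & (z)))
#       #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#
#       static void sha256_round(uint32_t s[8], uint32_t kw)
#       {
#               uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
#               uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
#               uint32_t t1 = h + S1(e) + Ch(e, f, g) + kw;
#               uint32_t t2 = S0(a) + Maj(a, b, c);
#
#               s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
#               s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
#       }
#
# The assembly uses the equivalent forms CH = ((f^g)&e)^g and
# MAJ = ((a|c)&b)|(a&c), which compute the same functions with fewer
# instructions.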

########################################################################
## void sha256_transform_avx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : number of 64-byte blocks to process
########################################################################
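
# A hedged C view of the interface as seen from the x86 glue code
# (arch/x86/crypto/sha256_ssse3_glue.c); the wrapper below is only an
# illustration of how a caller must bracket the call with an FPU region,
# since the routine uses XMM registers:
#
#       #include <linux/linkage.h>
#       #include <linux/types.h>
#       #include <crypto/sha2.h>        /* struct sha256_state          */
#       #include <asm/fpu/api.h>        /* kernel_fpu_begin/end         */
#
#       asmlinkage void sha256_transform_avx(struct sha256_state *state,
#                                            const u8 *data, int blocks);
#
#       static void sha256_avx_blocks(struct sha256_state *st,
#                                     const u8 *data, int blocks)
#       {
#               kernel_fpu_begin();
#               sha256_transform_avx(st, data, blocks);
#               kernel_fpu_end();
#       }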
.text
SYM_FUNC_START(sha256_transform_avx)
.align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        movq    %rsp, %rbp

        subq    $STACK_SIZE, %rsp       # allocate stack space
        and     $~15, %rsp              # align stack pointer

        shl     $6, NUM_BLKS            # convert to bytes
        jz      done_hash
        add     INP, NUM_BLKS           # pointer to end of data
        mov     NUM_BLKS, _INP_END(%rsp)

        ## load initial digest
        mov     4*0(CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa  _SHUF_DC00(%rip), SHUF_DC00
loop0:
        lea     K256(%rip), TBL

        ## byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 rounds of 16 each
        mov     $3, SRND
.align 16
loop1:
        vpaddd  (TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddd  1*16(TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddd  2*16(TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        vpaddd  3*16(TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        add     $4*16, TBL
        FOUR_ROUNDS_AND_SCHED

        sub     $1, SRND
        jne     loop1

        ## final 16 rounds: W[48..63] are already scheduled in X0..X3
        mov     $2, SRND
loop2:
        vpaddd  (TBL), X0, XFER
        vmovdqa XFER, _XFER(%rsp)
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        vpaddd  1*16(TBL), X1, XFER
        vmovdqa XFER, _XFER(%rsp)
        add     $2*16, TBL
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        vmovdqa X2, X0
        vmovdqa X3, X1

        sub     $1, SRND
        jne     loop2

        ## add this block's working variables back into the digest
        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        mov     _INP(%rsp), INP
        add     $64, INP
        cmp     _INP_END(%rsp), INP
        jne     loop0

done_hash:

        mov     %rbp, %rsp
        popq    %rbp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        ret
SYM_FUNC_END(sha256_transform_avx)

.section        .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203

.section        .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section        .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF