linux/arch/x86/crypto/sha256-ssse3-asm.S
########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define    MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
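# For reference, a minimal C sketch of the same operation (illustrative
# only; "mem" and "reg" are hypothetical names for the two operands):
#
#     reg += mem;     /* reg-mem add  */
#     mem  = reg;     /* store result */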

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
        MOVDQ \p2, \p1
        pshufb \p3, \p1
.endm
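# For reference, a rough C equivalent of this load-and-byteswap step
# (illustrative sketch only; copy_and_bswap() is a hypothetical helper
# that converts four big-endian message dwords to host order):
#
#     static void copy_and_bswap(u32 dst[4], const u8 *src)
#     {
#             int i;
#
#             for (i = 0; i < 4; i++)
#                     dst[i] = ((u32)src[4*i] << 24) | ((u32)src[4*i+1] << 16) |
#                              ((u32)src[4*i+2] << 8) |  (u32)src[4*i+3];
#     }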

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10      # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx   # 3rd arg
INP = %rsi        # 2nd arg
CTX = %rdi        # 1st arg

SRND = %rsi       # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d



_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP            = _INP_END  + _INP_END_SIZE
_XFER           = _INP      + _INP_SIZE
_XMM_SAVE       = _XFER     + _XFER_SIZE
STACK_SIZE      = _XMM_SAVE + _XMM_SAVE_SIZE
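# Rough picture of the frame laid out by the offsets above on the
# (16-byte aligned) stack, expressed as a hypothetical C struct for
# illustration only:
#
#     struct frame {
#             u64 inp_end;    /* _INP_END: pointer to end of input data      */
#             u64 inp;        /* _INP:     saved input pointer               */
#             u32 xfer[4];    /* _XFER:    K[t] + W[t] for the next 4 rounds */
#             /* _XMM_SAVE is currently unused (size 0)                      */
#     };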

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
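# ROTATE_ARGS above is conceptually the usual end-of-round shuffle of
# the working variables, e.g. in C:
#
#     tmp = h; h = g; g = f; f = e; e = d; d = c; c = b; b = a; a = tmp;
#
# Here no data is moved at run time: the register *names* a..h are
# reassigned at assembly time instead.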

.macro FOUR_ROUNDS_AND_SCHED
        ## compute s0 four at a time and s1 two at a time
        ## compute W[-16] + W[-7] 4 at a time
        movdqa  X3, XTMP0
        mov     e, y0                   # y0 = e
        ror     $(25-11), y0            # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        palignr $4, X2, XTMP0           # XTMP0 = W[-7]
        ror     $(22-13), y1            # y1 = a >> (22-13)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        movdqa  X1, XTMP1
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        xor     g, y2                   # y2 = f^g
        paddd   X0, XTMP0               # XTMP0 = W[-7] + W[-16]
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        ## compute s0
        palignr $4, X0, XTMP1           # XTMP1 = W[-15]
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        movdqa  XTMP1, XTMP2            # XTMP2 = W[-15]
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y0, y2                  # y2 = S1 + CH
        add     _XFER(%rsp), y2         # y2 = k + w + S1 + CH
        movdqa  XTMP1, XTMP3            # XTMP3 = W[-15]
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pslld   $(32-7), XTMP1          # XTMP1 = W[-15] << (32-7)
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        psrld   $7, XTMP2               # XTMP2 = W[-15] >> 7
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        por     XTMP2, XTMP1            # XTMP1 = W[-15] ror 7
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
                                        #
        ROTATE_ARGS                     #
        movdqa  XTMP3, XTMP2            # XTMP2 = W[-15]
        mov     e, y0                   # y0 = e
        mov     a, y1                   # y1 = a
        movdqa  XTMP3, XTMP4            # XTMP4 = W[-15]
        ror     $(25-11), y0            # y0 = e >> (25-11)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        ror     $(22-13), y1            # y1 = a >> (22-13)
        pslld   $(32-18), XTMP3         # XTMP3 = W[-15] << (32-18)
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                   # y2 = f^g
        psrld   $18, XTMP2              # XTMP2 = W[-15] >> 18
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP3, XTMP1
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        psrld   $3, XTMP4               # XTMP4 = W[-15] >> 3
        add     y0, y2                  # y2 = S1 + CH
        add     (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        pxor    XTMP2, XTMP1            # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pxor    XTMP4, XTMP1            # XTMP1 = s0
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        ## compute low s1
        pshufd  $0b11111010, X3, XTMP2   # XTMP2 = W[-2] {BBAA}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        paddd   XTMP1, XTMP0            # XTMP0 = W[-16] + W[-7] + s0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {BBAA}
        mov     e, y0                   # y0 = e
        mov     a, y1                   # y1 = a
        ror     $(25-11), y0            # y0 = e >> (25-11)
        movdqa  XTMP2, XTMP4            # XTMP4 = W[-2] {BBAA}
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        ror     $(22-13), y1            # y1 = a >> (22-13)
        mov     f, y2                   # y2 = f
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xBxA}
        xor     g, y2                   # y2 = f^g
        psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xBxA}
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        psrld   $10, XTMP4              # XTMP4 = W[-2] >> 10 {BBAA}
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        pxor    XTMP3, XTMP2
        add     y0, y2                  # y2 = S1 + CH
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        pxor    XTMP2, XTMP4            # XTMP4 = s1 {xBxA}
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pshufb  SHUF_00BA, XTMP4        # XTMP4 = s1 {00BA}
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        paddd   XTMP4, XTMP0            # XTMP0 = {..., ..., W[1], W[0]}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        ## compute high s1
        pshufd  $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ
                                        #
        ROTATE_ARGS                     #
        movdqa  XTMP2, XTMP3            # XTMP3 = W[-2] {DDCC}
        mov     e, y0                   # y0 = e
        ror     $(25-11), y0            # y0 = e >> (25-11)
        mov     a, y1                   # y1 = a
        movdqa  XTMP2, X0               # X0    = W[-2] {DDCC}
        ror     $(22-13), y1            # y1 = a >> (22-13)
        xor     e, y0                   # y0 = e ^ (e >> (25-11))
        mov     f, y2                   # y2 = f
        ror     $(11-6), y0             # y0 = (e >> (11-6)) ^ (e >> (25-6))
        psrlq   $17, XTMP2              # XTMP2 = W[-2] ror 17 {xDxC}
        xor     a, y1                   # y1 = a ^ (a >> (22-13))
        xor     g, y2                   # y2 = f^g
        psrlq   $19, XTMP3              # XTMP3 = W[-2] ror 19 {xDxC}
        xor     e, y0                   # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        and     e, y2                   # y2 = (f^g)&e
        ror     $(13-2), y1             # y1 = (a >> (13-2)) ^ (a >> (22-2))
        psrld   $10, X0                 # X0 = W[-2] >> 10 {DDCC}
        xor     a, y1                   # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     $6, y0                  # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                   # y2 = CH = ((f^g)&e)^g
        pxor    XTMP3, XTMP2            #
        ror     $2, y1                  # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        add     y0, y2                  # y2 = S1 + CH
        add     (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
        pxor    XTMP2, X0               # X0 = s1 {xDxC}
        mov     a, y0                   # y0 = a
        add     y2, h                   # h = h + S1 + CH + k + w
        mov     a, y2                   # y2 = a
        pshufb  SHUF_DC00, X0           # X0 = s1 {DC00}
        or      c, y0                   # y0 = a|c
        add     h, d                    # d = d + h + S1 + CH + k + w
        and     c, y2                   # y2 = a&c
        paddd   XTMP0, X0               # X0 = {W[3], W[2], W[1], W[0]}
        and     b, y0                   # y0 = (a|c)&b
        add     y1, h                   # h = h + S1 + CH + k + w + S0
        or      y2, y0                  # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                   # h = h + S1 + CH + k + w + S0 + MAJ

        ROTATE_ARGS
        rotate_Xs
.endm
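# For reference, the scalar form of the message-schedule step that the
# vector code above computes four words at a time (a sketch; ror32() as
# in <linux/bitops.h>):
#
#     s0(x) = ror32(x,  7) ^ ror32(x, 18) ^ (x >>  3)
#     s1(x) = ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)
#
#     W[i] = W[i-16] + s0(W[i-15]) + W[i-7] + s1(W[i-2])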

## input is [rsp + _XFER + \round * 4]
.macro DO_ROUND round
        mov     e, y0                 # y0 = e
        ror     $(25-11), y0          # y0 = e >> (25-11)
        mov     a, y1                 # y1 = a
        xor     e, y0                 # y0 = e ^ (e >> (25-11))
        ror     $(22-13), y1          # y1 = a >> (22-13)
        mov     f, y2                 # y2 = f
        xor     a, y1                 # y1 = a ^ (a >> (22-13))
        ror     $(11-6), y0           # y0 = (e >> (11-6)) ^ (e >> (25-6))
        xor     g, y2                 # y2 = f^g
        xor     e, y0                 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
        ror     $(13-2), y1           # y1 = (a >> (13-2)) ^ (a >> (22-2))
        and     e, y2                 # y2 = (f^g)&e
        xor     a, y1                 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
        ror     $6, y0                # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
        xor     g, y2                 # y2 = CH = ((f^g)&e)^g
        add     y0, y2                # y2 = S1 + CH
        ror     $2, y1                # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
        offset = \round * 4 + _XFER
        add     offset(%rsp), y2      # y2 = k + w + S1 + CH
        mov     a, y0                 # y0 = a
        add     y2, h                 # h = h + S1 + CH + k + w
        mov     a, y2                 # y2 = a
        or      c, y0                 # y0 = a|c
        add     h, d                  # d = d + h + S1 + CH + k + w
        and     c, y2                 # y2 = a&c
        and     b, y0                 # y0 = (a|c)&b
        add     y1, h                 # h = h + S1 + CH + k + w + S0
        or      y2, y0                # y0 = MAJ = ((a|c)&b)|(a&c)
        add     y0, h                 # h = h + S1 + CH + k + w + S0 + MAJ
        ROTATE_ARGS
.endm
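# For reference, one scalar SHA-256 round in C (a sketch per FIPS 180-4;
# ror32() as in <linux/bitops.h>).  Note that the CH and MAJ forms used
# above, ((f^g)&e)^g and ((a|c)&b)|(a&c), are equivalent to the textbook
# (e&f)^(~e&g) and (a&b)^(a&c)^(b&c):
#
#     t1 = h + (ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25))    /* S1  */
#            + (((f ^ g) & e) ^ g)                            /* CH  */
#            + K[i] + W[i];
#     t2 = (ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22))        /* S0  */
#            + (((a | c) & b) | (a & c));                     /* MAJ */
#     d += t1;
#     h  = t1 + t2;     /* then rotate the working variables */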

########################################################################
## void sha256_transform_ssse3(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
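# Rough outline of the code below (illustrative pseudo-C only):
#
#     for (each 64-byte block, until INP reaches _INP_END) {
#             load and byte-swap 16 message dwords into X0..X3;
#             for (SRND = 3; SRND != 0; SRND--)       /* loop1: rounds 0..47  */
#                     4 x FOUR_ROUNDS_AND_SCHED;      /* rounds + scheduling  */
#             for (SRND = 2; SRND != 0; SRND--)       /* loop2: rounds 48..63 */
#                     8 x DO_ROUND;                   /* no more scheduling   */
#             add a..h back into the digest (addm);
#     }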
.text
ENTRY(sha256_transform_ssse3)
.align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
        pushq   %rbp
        mov     %rsp, %rbp

        subq    $STACK_SIZE, %rsp
        and     $~15, %rsp

        shl     $6, NUM_BLKS             # convert to bytes
        jz      done_hash
        add     INP, NUM_BLKS
        mov     NUM_BLKS, _INP_END(%rsp) # pointer to end of data

        ## load initial digest
        mov     4*0(CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        movdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        movdqa  _SHUF_00BA(%rip), SHUF_00BA
        movdqa  _SHUF_DC00(%rip), SHUF_DC00

loop0:
        lea     K256(%rip), TBL

        ## byte swap first 16 dwords
        COPY_XMM_AND_BSWAP      X0, 0*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X1, 1*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X2, 2*16(INP), BYTE_FLIP_MASK
        COPY_XMM_AND_BSWAP      X3, 3*16(INP), BYTE_FLIP_MASK

        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
        mov     $3, SRND
.align 16
loop1:
        movdqa  (TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        movdqa  1*16(TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        movdqa  2*16(TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        FOUR_ROUNDS_AND_SCHED

        movdqa  3*16(TBL), XFER
        paddd   X0, XFER
        movdqa  XFER, _XFER(%rsp)
        add     $4*16, TBL
        FOUR_ROUNDS_AND_SCHED

        sub     $1, SRND
        jne     loop1

        mov     $2, SRND
loop2:
        paddd   (TBL), X0
        movdqa  X0, _XFER(%rsp)
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3
        paddd   1*16(TBL), X1
        movdqa  X1, _XFER(%rsp)
        add     $2*16, TBL
        DO_ROUND        0
        DO_ROUND        1
        DO_ROUND        2
        DO_ROUND        3

        movdqa  X2, X0
        movdqa  X3, X1

        sub     $1, SRND
        jne     loop2

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        mov     _INP(%rsp), INP
        add     $64, INP
        cmp     _INP_END(%rsp), INP
        jne     loop0

done_hash:

        mov     %rbp, %rsp
        popq    %rbp
        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx

        ret
ENDPROC(sha256_transform_ssse3)
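# For reference, a minimal sketch of how a caller would typically invoke
# this routine (e.g. from the glue code in sha256_ssse3_glue.c); digest,
# data and blocks are placeholder names, and the FPU context must be
# held while the SSE registers are in use:
#
#     kernel_fpu_begin();
#     sha256_transform_ssse3(digest, data, blocks);
#     kernel_fpu_end();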

.section        .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
        .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
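# The K256 constants are not arbitrary: K[i] is the first 32 bits of the
# fractional part of the cube root of the i-th prime (FIPS 180-4).  A
# small illustrative C sketch (assumes long double precision suffices):
#
#     #include <math.h>
#     static unsigned int k256_word(unsigned int prime)
#     {
#             long double c = cbrtl((long double)prime);
#             return (unsigned int)((c - floorl(c)) * 4294967296.0L);
#     }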

.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203

.section        .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section        .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

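# For reference, a rough C model of how pshufb applies the masks above
# (illustrative only): each mask byte selects a source byte, and a mask
# byte with its top bit set (the 0xFF bytes) produces zero.  That is how
# _SHUF_00BA/_SHUF_DC00 clear the unused half while moving the s1 words
# into place, and how PSHUFFLE_BYTE_FLIP_MASK reverses the bytes of each
# dword:
#
#     static void pshufb_c(u8 dst[16], const u8 src[16], const u8 mask[16])
#     {
#             int i;
#
#             for (i = 0; i < 16; i++)
#                     dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
#     }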