linux/arch/x86/crypto/sha256-mb/sha256_x8_avx2.S
/*
 * Multi-buffer SHA256 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 *  Copyright(c) 2016 Intel Corporation.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of version 2 of the GNU General Public License as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  Contact Information:
 *      Megha Dey <megha.dey@linux.intel.com>
 *
 *  BSD LICENSE
 *
 *  Copyright(c) 2016 Intel Corporation.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha256_mb_mgr_datastruct.S"

## code to compute 8-lane (oct) SHA256 using AVX2
## outer calling routine takes care of saving and restoring the XMM registers
## Logic designed/laid out by JDG

## Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15; %ymm0-15
## Linux clobbers:    rax rbx rcx rdx rsi            r9 r10 r11 r12 r13 r14 r15
## Linux preserves:                       rdi rbp r8
##
## clobbers %ymm0-15

arg1 = %rdi
arg2 = %rsi
reg3 = %rcx
reg4 = %rdx

# Common definitions
STATE = arg1
INP_SIZE = arg2

IDX = %rax
ROUND = %rbx
TBL = reg3

inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = reg4

a = %ymm0
b = %ymm1
c = %ymm2
d = %ymm3
e = %ymm4
f = %ymm5
g = %ymm6
h = %ymm7

T1 = %ymm8

a0 = %ymm12
a1 = %ymm13
a2 = %ymm14
TMP = %ymm15
TMP0 = %ymm6
TMP1 = %ymm7

TT0 = %ymm8
TT1 = %ymm9
TT2 = %ymm10
TT3 = %ymm11
TT4 = %ymm12
TT5 = %ymm13
TT6 = %ymm14
TT7 = %ymm15

# Define stack usage

# Assume the stack is 32-byte aligned before the call. The prologue
# below realigns %rsp explicitly (and $~0x1F, %rsp), so FRAMESZ only
# needs to cover the local variables plus the alignment slack.

#define FRAMESZ 0x388

#define VMOVPS  vmovups

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
#

.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
        # process top half (r0..r3) {a...d}
        vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
        vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
        vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
        vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
        vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
        vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
        vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
        vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

        # use r2 in place of t0
        # process bottom half (r4..r7) {e...h}
        vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
        vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
        vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
        vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
        vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
        vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
        vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
        vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

        vperm2f128      $0x13, \r1, \r5, \r6  # h6...a6
        vperm2f128      $0x02, \r1, \r5, \r2  # h2...a2
        vperm2f128      $0x13, \r3, \r7, \r5  # h5...a5
        vperm2f128      $0x02, \r3, \r7, \r1  # h1...a1
        vperm2f128      $0x13, \r0, \r4, \r7  # h7...a7
        vperm2f128      $0x02, \r0, \r4, \r3  # h3...a3
        vperm2f128      $0x13, \t0, \t1, \r4  # h4...a4
        vperm2f128      $0x02, \t0, \t1, \r0  # h0...a0

.endm
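
# For reference, TRANSPOSE8 is equivalent to the following C sketch
# (illustrative only, not part of the build; names are hypothetical).
# Lane i's eight dwords come in as in[i], and out[j] gathers dword j
# of every lane:
#
#       #include <stdint.h>
#
#       static void transpose8(uint32_t out[8][8], const uint32_t in[8][8])
#       {
#               int i, j;
#
#               for (i = 0; i < 8; i++)
#                       for (j = 0; j < 8; j++)
#                               out[j][i] = in[i][j];
#       }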

.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

.macro _PRORD reg imm tmp
        vpslld  $(32-\imm),\reg,\tmp
        vpsrld  $\imm,\reg, \reg
        vpor    \tmp,\reg, \reg
.endm

# PRORD_nd reg, imm, tmp, src
.macro _PRORD_nd reg imm tmp src
        vpslld  $(32-\imm), \src, \tmp
        vpsrld  $\imm, \src, \reg
        vpor    \tmp, \reg, \reg
.endm

# PRORD dst/src, amt
.macro PRORD reg imm
        _PRORD  \reg,\imm,TMP
.endm

# PRORD_nd dst, src, amt
.macro PRORD_nd reg tmp imm
        _PRORD_nd       \reg, \imm, TMP, \tmp
.endm
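
# Per 32-bit dword lane, PRORD is a rotate right. A scalar C sketch of
# the operation (illustrative only; the function name is hypothetical):
#
#       #include <stdint.h>
#
#       static uint32_t rord32(uint32_t x, unsigned int n) /* 1 <= n <= 31 */
#       {
#               return (x >> n) | (x << (32 - n));
#       }
#
# PRORD rotates its operand in place; PRORD_nd is the non-destructive
# form, writing the rotated copy of the source into the destination.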

# arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_00_15 _T1 i
        PRORD_nd        a0,e,5  # sig1: a0 = (e >> 5)

        vpxor   g, f, a2        # ch: a2 = f^g
        vpand   e,a2, a2        # ch: a2 = (f^g)&e
        vpxor   g, a2, a2       # a2 = ch

        PRORD_nd        a1,e,25 # sig1: a1 = (e >> 25)

        vmovdqu \_T1,(SZ8*(\i & 0xf))(%rsp)
        vpaddd  (TBL,ROUND,1), \_T1, \_T1       # T1 = W + K
        vpxor   e,a0, a0        # sig1: a0 = e ^ (e >> 5)
        PRORD   a0, 6           # sig1: a0 = (e >> 6) ^ (e >> 11)
        vpaddd  a2, h, h        # h = h + ch
        PRORD_nd        a2,a,11 # sig0: a2 = (a >> 11)
        vpaddd  \_T1,h, h       # h = h + ch + W + K
        vpxor   a1, a0, a0      # a0 = sigma1
        PRORD_nd        a1,a,22 # sig0: a1 = (a >> 22)
        vpxor   c, a, \_T1      # maj: T1 = a^c
        add     $SZ8, ROUND     # ROUND++
        vpand   b, \_T1, \_T1   # maj: T1 = (a^c)&b
        vpaddd  a0, h, h
        vpaddd  h, d, d
        vpxor   a, a2, a2       # sig0: a2 = a ^ (a >> 11)
        PRORD   a2,2            # sig0: a2 = (a >> 2) ^ (a >> 13)
        vpxor   a1, a2, a2      # a2 = sig0
        vpand   c, a, a1        # maj: a1 = a&c
        vpor    \_T1, a1, a1    # a1 = maj
        vpaddd  a1, h, h        # h = h + ch + W + K + maj
        vpaddd  a2, h, h        # h = h + ch + W + K + maj + sigma0
        ROTATE_ARGS
.endm
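
# Per 32-bit lane, the macro above computes one FIPS 180-4 round. A C
# sketch (rord32 is the rotate from the PRORD note; round_00_15 is a
# hypothetical name):
#
#       static void round_00_15(uint32_t s[8], uint32_t w, uint32_t k)
#       {
#               uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
#               uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
#               uint32_t ch   = ((f ^ g) & e) ^ g;
#               uint32_t maj  = ((a ^ c) & b) | (a & c);
#               uint32_t sig1 = rord32(e, 6) ^ rord32(e, 11) ^ rord32(e, 25);
#               uint32_t sig0 = rord32(a, 2) ^ rord32(a, 13) ^ rord32(a, 22);
#               uint32_t t1   = h + sig1 + ch + k + w;
#
#               s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
#               s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + sig0 + maj;
#       }
#
# The macro avoids the final shuffle: ROTATE_ARGS renames the register
# symbols a..h instead of moving any data.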

# arguments passed implicitly in preprocessor symbols i, a...h
.macro ROUND_16_XX _T1 i
        vmovdqu (SZ8*((\i-15)&0xf))(%rsp), \_T1
        vmovdqu (SZ8*((\i-2)&0xf))(%rsp), a1
        vmovdqu \_T1, a0
        PRORD   \_T1,11
        vmovdqu a1, a2
        PRORD   a1,2
        vpxor   a0, \_T1, \_T1
        PRORD   \_T1, 7
        vpxor   a2, a1, a1
        PRORD   a1, 17
        vpsrld  $3, a0, a0
        vpxor   a0, \_T1, \_T1
        vpsrld  $10, a2, a2
        vpxor   a2, a1, a1
        vpaddd  (SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
        vpaddd  (SZ8*((\i-7)&0xf))(%rsp), a1, a1
        vpaddd  a1, \_T1, \_T1

        ROUND_00_15 \_T1,\i
.endm
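
# Per lane, this expands the message schedule. A C sketch (same rord32
# as above; w[] is the 16-entry circular buffer at the bottom of the
# stack frame, which is why the indices are masked with 0xf):
#
#       uint32_t x0 = w[(i - 15) & 0xf], x1 = w[(i - 2) & 0xf];
#       uint32_t s0 = rord32(x0, 7) ^ rord32(x0, 18) ^ (x0 >> 3);
#       uint32_t s1 = rord32(x1, 17) ^ rord32(x1, 19) ^ (x1 >> 10);
#
#       w[i & 0xf] = w[(i - 16) & 0xf] + s0 + w[(i - 7) & 0xf] + s1;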

# SHA256_ARGS:
#   UINT256 digest[8];  // transposed digests, one 8-lane row per state word
#   UINT8  *data_ptr[8];
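#
# A rough C view of that layout (a sketch; the struct name is
# hypothetical, and the authoritative definition lives in
# sha256_mb_mgr_datastruct.S):
#
#       #include <stdint.h>
#
#       struct sha256_args_x8 {
#               uint32_t digest[8][8];  /* digest[r][l] = state word r of lane l */
#               uint8_t *data_ptr[8];   /* one input pointer per lane */
#       };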
# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 num_blocks);
# arg 1 : STATE : pointer to the SHA256_ARGS structure above
# arg 2 : INP_SIZE : size of input in 64-byte blocks
        # general registers preserved in outer calling routine
        # outer calling routine saves all the XMM registers
        # save rsp, allocate a 32-byte aligned frame for local variables
ENTRY(sha256_x8_avx2)

        # save callee-saved clobbered registers to comply with C function ABI
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, IDX
        sub     $FRAMESZ, %rsp
        and     $~0x1F, %rsp
        mov     IDX, _rsp(%rsp)

        # Load the pre-transposed incoming digest.
        vmovdqu 0*SHA256_DIGEST_ROW_SIZE(STATE),a
        vmovdqu 1*SHA256_DIGEST_ROW_SIZE(STATE),b
        vmovdqu 2*SHA256_DIGEST_ROW_SIZE(STATE),c
        vmovdqu 3*SHA256_DIGEST_ROW_SIZE(STATE),d
        vmovdqu 4*SHA256_DIGEST_ROW_SIZE(STATE),e
        vmovdqu 5*SHA256_DIGEST_ROW_SIZE(STATE),f
        vmovdqu 6*SHA256_DIGEST_ROW_SIZE(STATE),g
        vmovdqu 7*SHA256_DIGEST_ROW_SIZE(STATE),h

        lea     K256_8(%rip),TBL

        # load the address of each of the 8 message lanes
        # getting ready to transpose input onto stack
        mov     _args_data_ptr+0*PTR_SZ(STATE),inp0
        mov     _args_data_ptr+1*PTR_SZ(STATE),inp1
        mov     _args_data_ptr+2*PTR_SZ(STATE),inp2
        mov     _args_data_ptr+3*PTR_SZ(STATE),inp3
        mov     _args_data_ptr+4*PTR_SZ(STATE),inp4
        mov     _args_data_ptr+5*PTR_SZ(STATE),inp5
        mov     _args_data_ptr+6*PTR_SZ(STATE),inp6
        mov     _args_data_ptr+7*PTR_SZ(STATE),inp7

        xor     IDX, IDX
lloop:
        xor     ROUND, ROUND

        # save old digest
        vmovdqu a, _digest(%rsp)
        vmovdqu b, _digest+1*SZ8(%rsp)
        vmovdqu c, _digest+2*SZ8(%rsp)
        vmovdqu d, _digest+3*SZ8(%rsp)
        vmovdqu e, _digest+4*SZ8(%rsp)
        vmovdqu f, _digest+5*SZ8(%rsp)
        vmovdqu g, _digest+6*SZ8(%rsp)
        vmovdqu h, _digest+7*SZ8(%rsp)
        i = 0
.rep 2
        VMOVPS  i*32(inp0, IDX), TT0
        VMOVPS  i*32(inp1, IDX), TT1
        VMOVPS  i*32(inp2, IDX), TT2
        VMOVPS  i*32(inp3, IDX), TT3
        VMOVPS  i*32(inp4, IDX), TT4
        VMOVPS  i*32(inp5, IDX), TT5
        VMOVPS  i*32(inp6, IDX), TT6
        VMOVPS  i*32(inp7, IDX), TT7
        vmovdqu g, _ytmp(%rsp)
        vmovdqu h, _ytmp+1*SZ8(%rsp)
        TRANSPOSE8      TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7,   TMP0, TMP1
        vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
        vmovdqu _ytmp(%rsp), g
        vpshufb TMP1, TT0, TT0
        vpshufb TMP1, TT1, TT1
        vpshufb TMP1, TT2, TT2
        vpshufb TMP1, TT3, TT3
        vpshufb TMP1, TT4, TT4
        vpshufb TMP1, TT5, TT5
        vpshufb TMP1, TT6, TT6
        vpshufb TMP1, TT7, TT7
        vmovdqu _ytmp+1*SZ8(%rsp), h
        vmovdqu TT4, _ytmp(%rsp)
        vmovdqu TT5, _ytmp+1*SZ8(%rsp)
        vmovdqu TT6, _ytmp+2*SZ8(%rsp)
        vmovdqu TT7, _ytmp+3*SZ8(%rsp)
        ROUND_00_15     TT0,(i*8+0)
        vmovdqu _ytmp(%rsp), TT0
        ROUND_00_15     TT1,(i*8+1)
        vmovdqu _ytmp+1*SZ8(%rsp), TT1
        ROUND_00_15     TT2,(i*8+2)
        vmovdqu _ytmp+2*SZ8(%rsp), TT2
        ROUND_00_15     TT3,(i*8+3)
        vmovdqu _ytmp+3*SZ8(%rsp), TT3
        ROUND_00_15     TT0,(i*8+4)
        ROUND_00_15     TT1,(i*8+5)
        ROUND_00_15     TT2,(i*8+6)
        ROUND_00_15     TT3,(i*8+7)
        i = (i+1)
.endr
        add     $64, IDX
        i = (i*8)

        jmp     Lrounds_16_xx
.align 16
Lrounds_16_xx:
.rep 16
        ROUND_16_XX     T1, i
        i = (i+1)
.endr

        cmp     $ROUNDS,ROUND
        jb      Lrounds_16_xx

        # add old digest
        vpaddd  _digest+0*SZ8(%rsp), a, a
        vpaddd  _digest+1*SZ8(%rsp), b, b
        vpaddd  _digest+2*SZ8(%rsp), c, c
        vpaddd  _digest+3*SZ8(%rsp), d, d
        vpaddd  _digest+4*SZ8(%rsp), e, e
        vpaddd  _digest+5*SZ8(%rsp), f, f
        vpaddd  _digest+6*SZ8(%rsp), g, g
        vpaddd  _digest+7*SZ8(%rsp), h, h

        sub     $1, INP_SIZE  # unit is blocks
        jne     lloop

        # write back to memory (state object) the transposed digest
        vmovdqu a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
        vmovdqu h, 7*SHA256_DIGEST_ROW_SIZE(STATE)

        # update input pointers
        add     IDX, inp0
        mov     inp0, _args_data_ptr+0*8(STATE)
        add     IDX, inp1
        mov     inp1, _args_data_ptr+1*8(STATE)
        add     IDX, inp2
        mov     inp2, _args_data_ptr+2*8(STATE)
        add     IDX, inp3
        mov     inp3, _args_data_ptr+3*8(STATE)
        add     IDX, inp4
        mov     inp4, _args_data_ptr+4*8(STATE)
        add     IDX, inp5
        mov     inp5, _args_data_ptr+5*8(STATE)
        add     IDX, inp6
        mov     inp6, _args_data_ptr+6*8(STATE)
        add     IDX, inp7
        mov     inp7, _args_data_ptr+7*8(STATE)

        # Postamble
        mov     _rsp(%rsp), %rsp

        # restore callee-saved clobbered registers
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12

        ret
ENDPROC(sha256_x8_avx2)
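
# Illustrative call from C, under the hypothetical struct sketched
# above (in the kernel this routine is reached through the sha256-mb
# manager, not called directly):
#
#       extern void sha256_x8_avx2(struct sha256_args_x8 *args,
#                                  uint64_t num_blocks);
#
# Each of the eight data_ptr[] lanes must supply num_blocks * 64 bytes
# of message data; the transposed digests are updated in place.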

.section        .rodata.K256_8, "a", @progbits
.align 64
K256_8:
        .octa   0x428a2f98428a2f98428a2f98428a2f98
        .octa   0x428a2f98428a2f98428a2f98428a2f98
        .octa   0x71374491713744917137449171374491
        .octa   0x71374491713744917137449171374491
        .octa   0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
        .octa   0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
        .octa   0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
        .octa   0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
        .octa   0x3956c25b3956c25b3956c25b3956c25b
        .octa   0x3956c25b3956c25b3956c25b3956c25b
        .octa   0x59f111f159f111f159f111f159f111f1
        .octa   0x59f111f159f111f159f111f159f111f1
        .octa   0x923f82a4923f82a4923f82a4923f82a4
        .octa   0x923f82a4923f82a4923f82a4923f82a4
        .octa   0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
        .octa   0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
        .octa   0xd807aa98d807aa98d807aa98d807aa98
        .octa   0xd807aa98d807aa98d807aa98d807aa98
        .octa   0x12835b0112835b0112835b0112835b01
        .octa   0x12835b0112835b0112835b0112835b01
        .octa   0x243185be243185be243185be243185be
        .octa   0x243185be243185be243185be243185be
        .octa   0x550c7dc3550c7dc3550c7dc3550c7dc3
        .octa   0x550c7dc3550c7dc3550c7dc3550c7dc3
        .octa   0x72be5d7472be5d7472be5d7472be5d74
        .octa   0x72be5d7472be5d7472be5d7472be5d74
        .octa   0x80deb1fe80deb1fe80deb1fe80deb1fe
        .octa   0x80deb1fe80deb1fe80deb1fe80deb1fe
        .octa   0x9bdc06a79bdc06a79bdc06a79bdc06a7
        .octa   0x9bdc06a79bdc06a79bdc06a79bdc06a7
        .octa   0xc19bf174c19bf174c19bf174c19bf174
        .octa   0xc19bf174c19bf174c19bf174c19bf174
        .octa   0xe49b69c1e49b69c1e49b69c1e49b69c1
        .octa   0xe49b69c1e49b69c1e49b69c1e49b69c1
        .octa   0xefbe4786efbe4786efbe4786efbe4786
        .octa   0xefbe4786efbe4786efbe4786efbe4786
        .octa   0x0fc19dc60fc19dc60fc19dc60fc19dc6
        .octa   0x0fc19dc60fc19dc60fc19dc60fc19dc6
        .octa   0x240ca1cc240ca1cc240ca1cc240ca1cc
        .octa   0x240ca1cc240ca1cc240ca1cc240ca1cc
        .octa   0x2de92c6f2de92c6f2de92c6f2de92c6f
        .octa   0x2de92c6f2de92c6f2de92c6f2de92c6f
        .octa   0x4a7484aa4a7484aa4a7484aa4a7484aa
        .octa   0x4a7484aa4a7484aa4a7484aa4a7484aa
        .octa   0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
        .octa   0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
        .octa   0x76f988da76f988da76f988da76f988da
        .octa   0x76f988da76f988da76f988da76f988da
        .octa   0x983e5152983e5152983e5152983e5152
        .octa   0x983e5152983e5152983e5152983e5152
        .octa   0xa831c66da831c66da831c66da831c66d
        .octa   0xa831c66da831c66da831c66da831c66d
        .octa   0xb00327c8b00327c8b00327c8b00327c8
        .octa   0xb00327c8b00327c8b00327c8b00327c8
        .octa   0xbf597fc7bf597fc7bf597fc7bf597fc7
        .octa   0xbf597fc7bf597fc7bf597fc7bf597fc7
        .octa   0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
        .octa   0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
        .octa   0xd5a79147d5a79147d5a79147d5a79147
        .octa   0xd5a79147d5a79147d5a79147d5a79147
        .octa   0x06ca635106ca635106ca635106ca6351
        .octa   0x06ca635106ca635106ca635106ca6351
        .octa   0x14292967142929671429296714292967
        .octa   0x14292967142929671429296714292967
        .octa   0x27b70a8527b70a8527b70a8527b70a85
        .octa   0x27b70a8527b70a8527b70a8527b70a85
        .octa   0x2e1b21382e1b21382e1b21382e1b2138
        .octa   0x2e1b21382e1b21382e1b21382e1b2138
        .octa   0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
        .octa   0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
        .octa   0x53380d1353380d1353380d1353380d13
        .octa   0x53380d1353380d1353380d1353380d13
        .octa   0x650a7354650a7354650a7354650a7354
        .octa   0x650a7354650a7354650a7354650a7354
        .octa   0x766a0abb766a0abb766a0abb766a0abb
        .octa   0x766a0abb766a0abb766a0abb766a0abb
        .octa   0x81c2c92e81c2c92e81c2c92e81c2c92e
        .octa   0x81c2c92e81c2c92e81c2c92e81c2c92e
        .octa   0x92722c8592722c8592722c8592722c85
        .octa   0x92722c8592722c8592722c8592722c85
        .octa   0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
        .octa   0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
        .octa   0xa81a664ba81a664ba81a664ba81a664b
        .octa   0xa81a664ba81a664ba81a664ba81a664b
        .octa   0xc24b8b70c24b8b70c24b8b70c24b8b70
        .octa   0xc24b8b70c24b8b70c24b8b70c24b8b70
        .octa   0xc76c51a3c76c51a3c76c51a3c76c51a3
        .octa   0xc76c51a3c76c51a3c76c51a3c76c51a3
        .octa   0xd192e819d192e819d192e819d192e819
        .octa   0xd192e819d192e819d192e819d192e819
        .octa   0xd6990624d6990624d6990624d6990624
        .octa   0xd6990624d6990624d6990624d6990624
        .octa   0xf40e3585f40e3585f40e3585f40e3585
        .octa   0xf40e3585f40e3585f40e3585f40e3585
        .octa   0x106aa070106aa070106aa070106aa070
        .octa   0x106aa070106aa070106aa070106aa070
        .octa   0x19a4c11619a4c11619a4c11619a4c116
        .octa   0x19a4c11619a4c11619a4c11619a4c116
        .octa   0x1e376c081e376c081e376c081e376c08
        .octa   0x1e376c081e376c081e376c081e376c08
        .octa   0x2748774c2748774c2748774c2748774c
        .octa   0x2748774c2748774c2748774c2748774c
        .octa   0x34b0bcb534b0bcb534b0bcb534b0bcb5
        .octa   0x34b0bcb534b0bcb534b0bcb534b0bcb5
        .octa   0x391c0cb3391c0cb3391c0cb3391c0cb3
        .octa   0x391c0cb3391c0cb3391c0cb3391c0cb3
        .octa   0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
        .octa   0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
        .octa   0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
        .octa   0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
        .octa   0x682e6ff3682e6ff3682e6ff3682e6ff3
        .octa   0x682e6ff3682e6ff3682e6ff3682e6ff3
        .octa   0x748f82ee748f82ee748f82ee748f82ee
        .octa   0x748f82ee748f82ee748f82ee748f82ee
        .octa   0x78a5636f78a5636f78a5636f78a5636f
        .octa   0x78a5636f78a5636f78a5636f78a5636f
        .octa   0x84c8781484c8781484c8781484c87814
        .octa   0x84c8781484c8781484c8781484c87814
        .octa   0x8cc702088cc702088cc702088cc70208
        .octa   0x8cc702088cc702088cc702088cc70208
        .octa   0x90befffa90befffa90befffa90befffa
        .octa   0x90befffa90befffa90befffa90befffa
        .octa   0xa4506ceba4506ceba4506ceba4506ceb
        .octa   0xa4506ceba4506ceba4506ceba4506ceb
        .octa   0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
        .octa   0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
        .octa   0xc67178f2c67178f2c67178f2c67178f2
        .octa   0xc67178f2c67178f2c67178f2c67178f2

.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203

.section        .rodata.cst256.K256, "aM", @progbits, 256
.align 64
.global K256
K256:
        .int    0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .int    0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .int    0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .int    0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .int    0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .int    0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .int    0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .int    0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .int    0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .int    0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .int    0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .int    0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .int    0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .int    0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .int    0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .int    0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2