########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
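#
# The message schedule for both blocks is computed together in the
# 256-bit ymm registers: block 1 occupies the low 128-bit lane of
# X0..X3 and block 2 the high lane.  Each group of four W[t] + K[t]
# values is spilled to the stack (_XFER), low lane at offset +0 and
# high lane at offset +16.  The scalar rounds for block 1 run
# interleaved with the schedule; block 2 is then processed purely from
# the spilled W[t] + K[t] values (loop3), with no further vector work.
########################################################################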

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =     %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =     %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP     = %rsi  # 2nd arg
CTX     = %rdi  # 1st arg
c       = %ecx
d       = %r8d
e       = %edx  # clobbers NUM_BLKS
y3      = %esi  # clobbers INP


TBL     = %rbp
SRND    = CTX   # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8
_RSP_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
_RSP            = _CTX      + _CTX_SIZE
STACK_SIZE      = _RSP      + _RSP_SIZE
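
# Resulting frame layout (offsets from the 32-byte-aligned %rsp):
#   _XFER      0..511   W[t] + K[t] for 2 blocks x 64 rounds x 4 bytes
#   _INP_END   512      pointer to the start of the last 64-byte block
#   _INP       520      current input pointer
#   _CTX       528      saved digest pointer (CTX is reused as SRND)
#   _RSP       536      caller's %rsp, restored at done_hash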

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm

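# The two macros above rename assembler symbols rather than move data:
# after every round the mapping of a..h (and of X0..X3 while
# scheduling) onto the underlying registers is rotated, so the round
# code never shuffles values between registers.  old_h keeps the name
# of the register that was "h" in the previous round, letting its last
# two additions be deferred into the next round (see DO_4ROUNDS).

########################################################################
# FOUR_ROUNDS_AND_SCHED: perform four SHA-256 rounds for block 1 while
# computing the next four message-schedule words for both blocks in
# the ymm registers.  \disp is the offset of this group's precomputed
# W[t] + K[t] values in the _XFER area.
#
# The scalar instructions implement the standard SHA-256 round;
# roughly, in C-like pseudocode (ror32() is a hypothetical 32-bit
# rotate-right helper):
#
#       S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25)
#       CH  = ((f ^ g) & e) ^ g                 /* == (e & f) ^ (~e & g) */
#       S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22)
#       MAJ = ((a | c) & b) | (a & c)           /* == majority(a, b, c) */
#       t1  = h + S1 + CH + K[t] + W[t]
#       d  += t1
#       h   = t1 + S0 + MAJ
#
# and the vector instructions compute, per block lane:
#
#       s0   = ror32(W[t-15], 7) ^ ror32(W[t-15], 18) ^ (W[t-15] >> 3)
#       s1   = ror32(W[t-2], 17) ^ ror32(W[t-2], 19) ^ (W[t-2] >> 10)
#       W[t] = W[t-16] + s0 + W[t-7] + s1
#
########################################################################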
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B

        addl    \disp(%rsp, SRND), h            # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA
        vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        vpaddd  X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        vpsrld  $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --

        vpsrlq  $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2           # y2 = (f^g)&e                          # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS
        rotate_Xs
.endm

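########################################################################
# DO_4ROUNDS: perform four SHA-256 rounds with no message scheduling,
# consuming W[t] + K[t] values already stored in the _XFER area at
# offset \disp.  Used for the final 16 rounds of the first block and
# for the whole of the second block, whose W[t] + K[t] values were
# spilled to the stack while the first block was being processed.
########################################################################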
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        addl    \disp(%rsp, SRND), h            # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --


        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
ENTRY(sha256_transform_rorx)
.align 32
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        mov     %rsp, %rax
        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp      # align rsp to 32 byte boundary
        mov     %rax, _RSP(%rsp)


        shl     $6, NUM_BLKS    # convert to bytes
        jz      done_hash
        lea     -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      only_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

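## Main loop: each pass loads and byte-swaps two 64-byte blocks,
## computes both message schedules together, and runs the 64 rounds
## for the first block (loop1 + loop2); the second block's rounds are
## run afterwards in loop3 from the W + K values saved on the stack.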
loop0:
        lea     K256(%rip), TBL

        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP),XTMP0
        VMOVDQ  1*32(INP),XTMP1
        VMOVDQ  2*32(INP),XTMP2
        VMOVDQ  3*32(INP),XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128      $0x20, XTMP2, XTMP0, X0
        vperm2i128      $0x31, XTMP2, XTMP0, X1
        vperm2i128      $0x20, XTMP3, XTMP1, X2
        vperm2i128      $0x31, XTMP3, XTMP1, X3

last_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 iterations of 16 rounds
        ## (12 FOUR_ROUNDS_AND_SCHED calls of 4 dwords each)
        xor     SRND, SRND

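        ## loop1: each vpaddd adds the round constants to 8 scheduled
        ## dwords (4 per block lane) and parks the result in the _XFER
        ## area; block 1's rounds read it back from offset +0, block 2's
        ## from offset +16 later in loop3.  SRND advances 4*32 bytes/pass.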
.align 16
loop1:
        vpaddd  0*32(TBL, SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 0*32

        vpaddd  1*32(TBL, SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 1*32

        vpaddd  2*32(TBL, SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 2*32

        vpaddd  3*32(TBL, SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 3*32

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      loop1

loop2:
        ## Do last 16 rounds with no scheduling
        vpaddd  0*32(TBL, SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 0*32
        vpaddd  1*32(TBL, SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 1*32
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      loop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        ja      done_hash

        #### Do second block using previously scheduled results
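        #### (loop3 reuses the W + K values already in the _XFER area;
        ####  the +16 byte offset selects the high-lane, i.e. block 2,
        ####  half of each 32-byte entry, so no vector work is needed)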
        xor     SRND, SRND
.align 16
loop3:
        DO_4ROUNDS       _XFER + 0*32 + 16
        DO_4ROUNDS       _XFER + 1*32 + 16
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      loop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        jb      loop0
        ja      done_hash

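        ## fall through when exactly one 64-byte block remains: only the
        ## low 128-bit (xmm) lanes are loaded, the high-lane schedule
        ## results go unused, and the second-block pass (loop3) is
        ## skipped because INP ends up past _INP_END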
do_last_block:
        #### do last block
        lea     K256(%rip), TBL

        VMOVDQ  0*16(INP),XWORD0
        VMOVDQ  1*16(INP),XWORD1
        VMOVDQ  2*16(INP),XWORD2
        VMOVDQ  3*16(INP),XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     last_block_enter

only_one_block:

        ## load initial digest
        mov     (4*0)(CTX),a
        mov     (4*1)(CTX),b
        mov     (4*2)(CTX),c
        mov     (4*3)(CTX),d
        mov     (4*4)(CTX),e
        mov     (4*5)(CTX),f
        mov     (4*6)(CTX),g
        mov     (4*7)(CTX),h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     do_last_block

done_hash:

        mov     _RSP(%rsp), %rsp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbp
        popq    %rbx
        ret
ENDPROC(sha256_transform_rorx)

.data
.align 64
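# K256: the 64 SHA-256 round constants; each row of four is repeated so
# that a single 32-byte vpaddd adds the same constants to both blocks'
# schedule words (low and high 128-bit lanes).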
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
#endif