linux/arch/x86/crypto/sha256-avx2-asm.S
########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################
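# Each 256-bit X register holds four consecutive message-schedule dwords of
# the first block in its low 128-bit lane and the matching four dwords of the
# second block in its high lane, so one vector pass schedules both blocks at
# once.  The rounds themselves are scalar: the first block consumes the
# low-lane K+W values stored on the stack, and the second block is replayed
# afterwards from the high-lane values at offset +16.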

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm
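# Example: "addm (4*0)(CTX), a" leaves a = a + [CTX] and writes the sum back
# to [CTX]; it is used below to fold the working variables into the digest
# kept in memory.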

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =     %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =     %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP     = %rsi  # 2nd arg
CTX     = %rdi  # 1st arg
c       = %ecx
d       = %r8d
e       = %edx  # clobbers NUM_BLKS
y3      = %esi  # clobbers INP

SRND    = CTX   # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8
_RSP_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
_RSP            = _CTX      + _CTX_SIZE
STACK_SIZE      = _RSP      + _RSP_SIZE
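# Resulting frame layout (byte offsets from the 32-byte-aligned %rsp):
#   _XFER      =   0   512 bytes of K+W data (2 blocks * 64 rounds * 4 bytes)
#   _INP_END   = 512   pointer to the last input block
#   _INP       = 520   current input pointer
#   _CTX       = 528   saved digest/context pointer
#   _RSP       = 536   caller's %rsp, restored on exit
#   STACK_SIZE = 544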

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm

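# For reference, the operations annotated S0/S1/CH/MAJ in the round macros
# below are the standard SHA-256 functions (on 32-bit words):
#   S1  = Sigma1(e) = (e ror 6)  ^ (e ror 11) ^ (e ror 25)
#   S0  = Sigma0(a) = (a ror 2)  ^ (a ror 13) ^ (a ror 22)
#   CH  = Ch(e,f,g)  = (e & f) ^ (~e & g),     computed as ((f^g)&e)^g
#   MAJ = Maj(a,b,c) = (a&b) ^ (a&c) ^ (b&c),  computed as ((a|c)&b)|(a&c)
# and the message schedule uses
#   s0(w) = (w ror 7)  ^ (w ror 18) ^ (w >> 3)
#   s1(w) = (w ror 17) ^ (w ror 19) ^ (w >> 10)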
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B

        addl    \disp(%rsp, SRND), h            # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA
        vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        vpaddd  X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        vpsrld  $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --

        vpsrlq  $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2           # y2 = (f^g)&e                          # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS
        rotate_Xs
.endm

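# DO_4ROUNDS: four SHA-256 rounds with no message scheduling.  The
# pre-computed K+W values are read from the XFER area on the stack; this is
# used for the last 16 rounds of a block and to replay the second block of
# each pair from the previously scheduled results.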
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        addl    \disp(%rsp, SRND), h            # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --


        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : number of blocks
########################################################################
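## Note: %rdi (CTX) is reused as the round counter SRND inside the loops, so
## the digest pointer is saved to _CTX on the stack and reloaded before the
## digest words are updated.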
.text
ENTRY(sha256_transform_rorx)
.align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        mov     %rsp, %rax
        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp      # align rsp to 32 byte boundary
        mov     %rax, _RSP(%rsp)


        shl     $6, NUM_BLKS    # convert to bytes
        jz      done_hash
        lea     -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      only_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

loop0:
        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP),XTMP0
        VMOVDQ  1*32(INP),XTMP1
        VMOVDQ  2*32(INP),XTMP2
        VMOVDQ  3*32(INP),XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128      $0x20, XTMP2, XTMP0, X0
        vperm2i128      $0x31, XTMP2, XTMP0, X1
        vperm2i128      $0x20, XTMP3, XTMP1, X2
        vperm2i128      $0x31, XTMP3, XTMP1, X3

last_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
        xor     SRND, SRND

.align 16
loop1:
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 0*32

        vpaddd  K256+1*32(SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 1*32

        vpaddd  K256+2*32(SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 2*32

        vpaddd  K256+3*32(SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 3*32

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      loop1
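        # SRND advances by 32 bytes of XFER data per 4 rounds (one 32-byte
        # K+W vector covers 4 rounds of each block), so 3*4*32 corresponds
        # to the first 48 rounds.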

loop2:
        ## Do last 16 rounds with no scheduling
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 0*32

        vpaddd  K256+1*32(SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 1*32
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      loop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        ja      done_hash

        #### Do second block using previously scheduled results
        xor     SRND, SRND
.align 16
loop3:
        DO_4ROUNDS       _XFER + 0*32 + 16
        DO_4ROUNDS       _XFER + 1*32 + 16
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      loop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        jb      loop0
        ja      done_hash
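        # neither branch taken: INP equals the last-block pointer, so exactly
        # one 64-byte block remains; fall through and process it on its own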

do_last_block:
        VMOVDQ  0*16(INP),XWORD0
        VMOVDQ  1*16(INP),XWORD1
        VMOVDQ  2*16(INP),XWORD2
        VMOVDQ  3*16(INP),XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     last_block_enter

only_one_block:

        ## load initial digest
        mov     (4*0)(CTX),a
        mov     (4*1)(CTX),b
        mov     (4*2)(CTX),c
        mov     (4*3)(CTX),d
        mov     (4*4)(CTX),e
        mov     (4*5)(CTX),f
        mov     (4*6)(CTX),g
        mov     (4*7)(CTX),h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     do_last_block

done_hash:

        mov     _RSP(%rsp), %rsp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        ret
ENDPROC(sha256_transform_rorx)

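# K256: the SHA-256 round constants.  Each group of four is stored twice so
# that one 32-byte load feeds the same constants to both 128-bit lanes, i.e.
# to both blocks being processed.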
.section        .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

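# byte-swap mask: converts the big-endian message bytes into little-endian
# dwords within each 128-bit lane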
.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section        .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section        .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif