# linux/arch/x86/crypto/sha256-avx2-asm.S
########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#include <linux/linkage.h>

## assume buffers not aligned
#define VMOVDQ vmovdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
        add     \p1, \p2
        mov     \p2, \p1
.endm

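# For reference, the macro above is a read-modify-write on a digest word held
# in memory; in C terms (a sketch):
#
#       p2 += *p1;      /* add  \p1, \p2  : reg += mem       */
#       *p1 = p2;       /* mov  \p2, \p1  : store sum back   */
#
# e.g. "addm (4*0)(CTX), a" leaves state[0] + a in both state[0] and register a.
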
################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7

# XMM versions of above
XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =     %ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =     %ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx # 3rd arg
INP     = %rsi  # 2nd arg
CTX     = %rdi  # 1st arg
c       = %ecx
d       = %r8d
e       = %edx  # clobbers NUM_BLKS
y3      = %esi  # clobbers INP

SRND    = CTX   # SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE      = 2*64*4        # 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE  = 0
_INP_END_SIZE   = 8
_INP_SIZE       = 8
_CTX_SIZE       = 8

_XFER           = 0
_XMM_SAVE       = _XFER     + _XFER_SIZE
_INP_END        = _XMM_SAVE + _XMM_SAVE_SIZE
_INP            = _INP_END  + _INP_END_SIZE
_CTX            = _INP      + _INP_SIZE
STACK_SIZE      = _CTX      + _CTX_SIZE

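# Resulting stack frame layout (byte offsets from the 32-byte-aligned %rsp),
# derived from the sizes above:
#
#       _XFER      =   0 .. 511   2 blocks x 64 rounds x 4 bytes of k+w
#       _INP_END   = 512          pointer to the last input block
#       _INP       = 520          current input pointer
#       _CTX       = 528          saved state pointer
#       STACK_SIZE = 536
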
# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
        X_ = X0
        X0 = X1
        X1 = X2
        X2 = X3
        X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
        old_h = h
        TMP_ = h
        h = g
        g = f
        f = e
        e = d
        d = c
        c = b
        b = a
        a = TMP_
.endm

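# Both macros above rename assembler symbols rather than moving data.  With the
# initial assignments (a = %eax ... h = %r11d), one ROTATE_ARGS gives
#
#       a = %r11d,  b = %eax,  c = %ebx,  d = %ecx,
#       e = %r8d,   f = %edx,  g = %r9d,  h = %r10d,
#
# and old_h = %r11d, i.e. old_h names the same register as the new 'a'.  The
# end-of-round variable shuffle of SHA-256 therefore costs no instructions,
# and the "add ..., old_h" instructions in the round macros below finish the
# previous round's h (which becomes the new a) late, to help interleaving.
# After eight rotations the original assignments are restored.
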
.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B

        addl    \disp(%rsp, SRND), h            # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA
        vpalignr $4, X2, X3, XTMP0 # XTMP0 = W[-7]
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH
        vpaddd  X0, XTMP0, XTMP0 # XTMP0 = W[-7] + W[-16]
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1

        and     e, y2           # y2 = (f^g)&e                          # CH
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     h, d            # d = k + w + h + d                     # --

        and     b, y3           # y3 = (a|c)&b                          # MAJA
        vpalignr $4, X0, X1, XTMP1      # XTMP1 = W[-15]
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0

        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        vpsrld  $7, XTMP1, XTMP2
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB

        add     y0, y2          # y2 = S1 + CH                          # --
        vpslld  $(32-7), XTMP1, XTMP3
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        vpor    XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7

        vpsrld  $18, XTMP1, XTMP2
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 1 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 1*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $3, XTMP1, XTMP4 # XTMP4 = W[-15] >> 3
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --

        vpslld  $(32-18), XTMP1, XTMP1
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0

        vpxor   XTMP1, XTMP3, XTMP3
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP2, XTMP3, XTMP3     # XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP4, XTMP3, XTMP1     # XTMP1 = s0
        vpshufd $0b11111010, X3, XTMP2  # XTMP2 = W[-2] {BBAA}
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        vpaddd  XTMP1, XTMP0, XTMP0     # XTMP0 = W[-16] + W[-7] + s0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        vpsrld  $10, XTMP2, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}


        ROTATE_ARGS

################################### RND N + 2 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        offset = \disp + 2*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --

        vpsrlq  $19, XTMP2, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        or      c, y3           # y3 = a|c                              # MAJA
        mov     f, y2           # y2 = f                                # CH
        xor     g, y2           # y2 = f^g                              # CH

        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xBxA}
        and     e, y2           # y2 = (f^g)&e                          # CH

        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        vpxor   XTMP3, XTMP2, XTMP2
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        vpxor   XTMP2, XTMP4, XTMP4     # XTMP4 = s1 {xBxA}
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpshufb SHUF_00BA, XTMP4, XTMP4 # XTMP4 = s1 {00BA}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpaddd  XTMP4, XTMP0, XTMP0     # XTMP0 = {..., ..., W[1], W[0]}

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --
        vpshufd $0b01010000, XTMP0, XTMP2       # XTMP2 = W[-2] {DDCC}

        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --


        ROTATE_ARGS

################################### RND N + 3 ############################

        mov     a, y3           # y3 = a                                # MAJA
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        offset = \disp + 3*4
        addl    offset(%rsp, SRND), h   # h = k + w + h         # --
        or      c, y3           # y3 = a|c                              # MAJA


        vpsrld  $10, XTMP2, XTMP5       # XTMP5 = W[-2] >> 10 {DDCC}
        mov     f, y2           # y2 = f                                # CH
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        xor     g, y2           # y2 = f^g                              # CH


        vpsrlq  $19, XTMP2, XTMP3       # XTMP3 = W[-2] ror 19 {xDxC}
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     h, d            # d = k + w + h + d                     # --
        and     b, y3           # y3 = (a|c)&b                          # MAJA

        vpsrlq  $17, XTMP2, XTMP2       # XTMP2 = W[-2] ror 17 {xDxC}
        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH

        vpxor   XTMP3, XTMP2, XTMP2
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        add     y0, y2          # y2 = S1 + CH                          # --

        vpxor   XTMP2, XTMP5, XTMP5     # XTMP5 = s1 {xDxC}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        vpshufb SHUF_DC00, XTMP5, XTMP5 # XTMP5 = s1 {DC00}

        vpaddd  XTMP0, XTMP5, X0        # X0 = {W[3], W[2], W[1], W[0]}
        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     c, T1           # T1 = a&c                              # MAJB
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS
        rotate_Xs
.endm

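########################################################################
# Reference only: a plain C sketch of what one round of
# FOUR_ROUNDS_AND_SCHED (above) and DO_4ROUNDS (below) computes per lane.
# The names w, k and ror32 are illustrative, not part of this file.
#
#       /* message schedule, done four dwords at a time by the vector code */
#       s0   = ror32(w[i-15], 7) ^ ror32(w[i-15], 18) ^ (w[i-15] >> 3);
#       s1   = ror32(w[i-2], 17) ^ ror32(w[i-2], 19) ^ (w[i-2] >> 10);
#       w[i] = w[i-16] + s0 + w[i-7] + s1;
#
#       /* one round; k[i]+w[i] is read from the _XFER area on the stack */
#       S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#       CH  = ((f ^ g) & e) ^ g;                /* == (e & f) ^ (~e & g)  */
#       t1  = h + S1 + CH + k[i] + w[i];
#       S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#       MAJ = ((a | c) & b) | (a & c);          /* == majority(a, b, c)   */
#       h = g; g = f; f = e; e = d + t1;
#       d = c; c = b; b = a; a = t1 + S0 + MAJ;
########################################################################
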
.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        addl    \disp(%rsp, SRND), h            # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --
        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 1 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*1 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 2 ##############################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*2 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --

        ROTATE_ARGS

################################### RND N + 3 ###########################

        add     y2, old_h       # h = k + w + h + S0 + S1 + CH = t1 + S0# --
        mov     f, y2           # y2 = f                                # CH
        rorx    $25, e, y0      # y0 = e >> 25                          # S1A
        rorx    $11, e, y1      # y1 = e >> 11                          # S1B
        xor     g, y2           # y2 = f^g                              # CH

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11)                # S1
        rorx    $6, e, y1       # y1 = (e >> 6)                         # S1
        and     e, y2           # y2 = (f^g)&e                          # CH
        add     y3, old_h       # h = t1 + S0 + MAJ                     # --

        xor     y1, y0          # y0 = (e>>25) ^ (e>>11) ^ (e>>6)       # S1
        rorx    $13, a, T1      # T1 = a >> 13                          # S0B
        xor     g, y2           # y2 = CH = ((f^g)&e)^g                 # CH
        rorx    $22, a, y1      # y1 = a >> 22                          # S0A
        mov     a, y3           # y3 = a                                # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13)                # S0
        rorx    $2, a, T1       # T1 = (a >> 2)                         # S0
        offset = 4*3 + \disp
        addl    offset(%rsp, SRND), h           # h = k + w + h # --
        or      c, y3           # y3 = a|c                              # MAJA

        xor     T1, y1          # y1 = (a>>22) ^ (a>>13) ^ (a>>2)       # S0
        mov     a, T1           # T1 = a                                # MAJB
        and     b, y3           # y3 = (a|c)&b                          # MAJA
        and     c, T1           # T1 = a&c                              # MAJB
        add     y0, y2          # y2 = S1 + CH                          # --


        add     h, d            # d = k + w + h + d                     # --
        or      T1, y3          # y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
        add     y1, h           # h = k + w + h + S0                    # --

        add     y2, d           # d = k + w + h + d + S1 + CH = d + t1  # --


        add     y2, h           # h = k + w + h + S0 + S1 + CH = t1 + S0# --

        add     y3, h           # h = t1 + S0 + MAJ                     # --

        ROTATE_ARGS

.endm

########################################################################
## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
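# As seen from C, the glue code declares and calls this routine roughly as
# sketched below (a sketch, not copied from the glue source; it assumes the
# eight u32 digest words sit at offset 0 of struct sha256_state, which is
# what the loads and stores through CTX below rely on):
#
#       asmlinkage void sha256_transform_rorx(struct sha256_state *state,
#                                             const u8 *data, int blocks);
#
#       /* process n complete 64-byte blocks starting at data */
#       sha256_transform_rorx(state, data, n);
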
.text
SYM_FUNC_START(sha256_transform_rorx)
.align 32
        pushq   %rbx
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        push    %rbp
        mov     %rsp, %rbp

        subq    $STACK_SIZE, %rsp
        and     $-32, %rsp      # align rsp to 32 byte boundary

        shl     $6, NUM_BLKS    # convert to bytes
        jz      done_hash
        lea     -64(INP, NUM_BLKS), NUM_BLKS # pointer to last block
        mov     NUM_BLKS, _INP_END(%rsp)

        cmp     NUM_BLKS, INP
        je      only_one_block

        ## load initial digest
        mov     (CTX), a
        mov     4*1(CTX), b
        mov     4*2(CTX), c
        mov     4*3(CTX), d
        mov     4*4(CTX), e
        mov     4*5(CTX), f
        mov     4*6(CTX), g
        mov     4*7(CTX), h

        vmovdqa  PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa  _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa  _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)

loop0:
        ## Load first 16 dwords from two blocks
        VMOVDQ  0*32(INP),XTMP0
        VMOVDQ  1*32(INP),XTMP1
        VMOVDQ  2*32(INP),XTMP2
        VMOVDQ  3*32(INP),XTMP3

        ## byte swap data
        vpshufb BYTE_FLIP_MASK, XTMP0, XTMP0
        vpshufb BYTE_FLIP_MASK, XTMP1, XTMP1
        vpshufb BYTE_FLIP_MASK, XTMP2, XTMP2
        vpshufb BYTE_FLIP_MASK, XTMP3, XTMP3

        ## transpose data into high/low halves
        vperm2i128      $0x20, XTMP2, XTMP0, X0
        vperm2i128      $0x31, XTMP2, XTMP0, X1
        vperm2i128      $0x20, XTMP3, XTMP1, X2
        vperm2i128      $0x31, XTMP3, XTMP1, X3
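
        ## After the transpose, each X register holds four consecutive
        ## message dwords of block 1 in its low 128-bit lane and the
        ## corresponding dwords of block 2 in its high lane:
        ##      X0 = { blk2 W[3:0]   | blk1 W[3:0]   }
        ##      X1 = { blk2 W[7:4]   | blk1 W[7:4]   }
        ##      X2 = { blk2 W[11:8]  | blk1 W[11:8]  }
        ##      X3 = { blk2 W[15:12] | blk1 W[15:12] }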

last_block_enter:
        add     $64, INP
        mov     INP, _INP(%rsp)

        ## schedule 48 input dwords, in 3 loop iterations of 16 rounds each
        xor     SRND, SRND

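        ## SRND advances by 4*32 bytes per loop1 iteration and serves both as
        ## the byte offset into the K256 constant table and as the offset into
        ## the _XFER save area, so k[i]+w[i] for both blocks lands on the
        ## stack exactly where the round macros expect to read it.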
.align 16
loop1:
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 0*32

        vpaddd  K256+1*32(SRND), X0, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 1*32

        vpaddd  K256+2*32(SRND), X0, XFER
        vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 2*32

        vpaddd  K256+3*32(SRND), X0, XFER
        vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
        FOUR_ROUNDS_AND_SCHED   _XFER + 3*32

        add     $4*32, SRND
        cmp     $3*4*32, SRND
        jb      loop1

loop2:
        ## Do last 16 rounds with no scheduling
        vpaddd  K256+0*32(SRND), X0, XFER
        vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 0*32

        vpaddd  K256+1*32(SRND), X1, XFER
        vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
        DO_4ROUNDS      _XFER + 1*32
        add     $2*32, SRND

        vmovdqa X2, X0
        vmovdqa X3, X1

        cmp     $4*4*32, SRND
        jb      loop2

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        ja      done_hash

        #### Do second block using previously scheduled results
        xor     SRND, SRND
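        ## Each 32-byte _XFER slot saved in loop1/loop2 holds block 1's four
        ## k+w dwords in bytes 0..15 and block 2's in bytes 16..31, so the
        ## "+ 16" displacement below replays the saved schedule for block 2.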
.align 16
loop3:
        DO_4ROUNDS       _XFER + 0*32 + 16
        DO_4ROUNDS       _XFER + 1*32 + 16
        add     $2*32, SRND
        cmp     $4*4*32, SRND
        jb      loop3

        mov     _CTX(%rsp), CTX
        mov     _INP(%rsp), INP
        add     $64, INP

        addm    (4*0)(CTX),a
        addm    (4*1)(CTX),b
        addm    (4*2)(CTX),c
        addm    (4*3)(CTX),d
        addm    (4*4)(CTX),e
        addm    (4*5)(CTX),f
        addm    (4*6)(CTX),g
        addm    (4*7)(CTX),h

        cmp     _INP_END(%rsp), INP
        jb      loop0
        ja      done_hash

do_last_block:
        VMOVDQ  0*16(INP),XWORD0
        VMOVDQ  1*16(INP),XWORD1
        VMOVDQ  2*16(INP),XWORD2
        VMOVDQ  3*16(INP),XWORD3

        vpshufb X_BYTE_FLIP_MASK, XWORD0, XWORD0
        vpshufb X_BYTE_FLIP_MASK, XWORD1, XWORD1
        vpshufb X_BYTE_FLIP_MASK, XWORD2, XWORD2
        vpshufb X_BYTE_FLIP_MASK, XWORD3, XWORD3

        jmp     last_block_enter

only_one_block:

        ## load initial digest
        mov     (4*0)(CTX),a
        mov     (4*1)(CTX),b
        mov     (4*2)(CTX),c
        mov     (4*3)(CTX),d
        mov     (4*4)(CTX),e
        mov     (4*5)(CTX),f
        mov     (4*6)(CTX),g
        mov     (4*7)(CTX),h

        vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
        vmovdqa _SHUF_00BA(%rip), SHUF_00BA
        vmovdqa _SHUF_DC00(%rip), SHUF_DC00

        mov     CTX, _CTX(%rsp)
        jmp     do_last_block

done_hash:

        mov     %rbp, %rsp
        pop     %rbp

        popq    %r15
        popq    %r14
        popq    %r13
        popq    %r12
        popq    %rbx
        ret
SYM_FUNC_END(sha256_transform_rorx)

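# SHA-256 round constants.  Each group of four constants is stored twice, so
# a single 256-bit vpaddd adds the same k values to the low lane (block 1)
# and the high lane (block 2) of the message schedule.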
.section        .rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

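# vpshufb mask that converts the little-endian byte order of each loaded
# dword to the big-endian order SHA-256 operates on (bytes 3,2,1,0 within
# each dword).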
.section        .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203

# shuffle xBxA -> 00BA
.section        .rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
        .octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100

# shuffle xDxC -> DC00
.section        .rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
        .octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF
